Esempio n. 1
0
void bamFetchAlreadyOpen(samfile_t *samfile, bam_index_t *idx, char *bamFileName,
                         char *position, bam_fetch_f callbackFunc, void *callbackData)
/* Variant of bamFetch() for callers that already hold an open bam file and its index */
/* (load the index with bam_index_load and release it with free()).  bam_fetch() is run */
/* over the region named by position, with callbackFunc and callbackData passed through. */
/* bamFileName is never opened here; it only appears in the warning text. */
{
    int chromId, start, end;
    int status = bam_parse_region(samfile->header, position, &chromId, &start, &end);
    if (status != 0 && startsWith("chr", position))
    {
        // Retry with the leading "chr" removed from the position string.
        char *withoutPrefix = position + strlen("chr");
        status = bam_parse_region(samfile->header, withoutPrefix, &chromId, &start, &end);
    }
    if (status == 0)
    {
        status = bam_fetch(samfile->x.bam, idx, chromId, start, end, callbackData, callbackFunc);
        if (status != 0)
            warn("bam_fetch(%s, %s (chromId=%d) failed (%d)", bamFileName, position, chromId, status);
    }
    // Otherwise the bam file does not cover the current chromosome, which is OK.
}
Esempio n. 2
0
/// Restrict the stream to the region described by \p region.
///
/// The string is resolved against the header of the open file with
/// bam_parse_region() and the resulting (ref,beg,end) triple is forwarded to the
/// integer overload of set_new_region(). The region text is then stored in _region.
void
bam_streamer::
set_new_region(const char* region)
{
    // Start from an explicitly invalid triple: the parser's status is not inspected
    // here, so if it leaves any output untouched on failure the integer overload
    // still receives defined values instead of indeterminate ones.
    int ref(-1);
    int beg(-1);
    int end(-1);
    bam_parse_region(_bfp->header, region, &ref, &beg, &end); // parse the region

    set_new_region(ref,beg,end);
    _region=region;
}
Esempio n. 3
0
// Build the region string "<chrName>:<lpos>-<rpos>" and resolve it against the
// header of `in` with bam_parse_region(), storing the outcome in
// region.tid / region.lpos / region.rpos.
//
// Always returns 0: the parser's status is not propagated (contract unchanged).
int MyBamWrap::myPassRegion(region_t & region,string& chrName, uint32_t& lpos, uint32_t& rpos)
{
	// The text is assembled in a growable string and each number is formatted
	// into a bounded scratch buffer, so an arbitrarily long chromosome name can
	// no longer overflow a fixed-size array as the former strcat() chain could.
	char num[32];
	string spec(chrName.c_str());
	spec += ':';
	snprintf(num, sizeof(num), "%u", static_cast<unsigned>(lpos));
	spec += num;
	spec += '-';
	snprintf(num, sizeof(num), "%u", static_cast<unsigned>(rpos));
	spec += num;
	bam_parse_region(in->header, spec.c_str(), &region.tid,  &region.lpos , &region.rpos);
	return 0;
}
Esempio n. 4
0
/// Open \p filename for reading and, when \p region is non-null, restrict the
/// stream to that region via set_new_region().
///
/// With no region the whole file is streamed; in that case a dummy region on the
/// first target is parsed purely so that header->hash gets created.
///
/// \throws blt_exception if \p filename is empty (or null, in builds where the
///         assert is compiled out).
/// Terminates the process with exit(EXIT_FAILURE) if the file cannot be opened.
bam_streamer::
bam_streamer(
    const char* filename,
    const char* region)
    : _is_record_set(false),
      _bfp(nullptr),
      _hidx(nullptr),
      _hitr(nullptr),
      _record_no(0),
      // Member initializers run before the assert in the body, so a null
      // filename must not reach the initializer unguarded.
      _stream_name(filename ? filename : ""),
      _is_region(false)
{
    assert(nullptr != filename);
    // The null test keeps release builds (assert disabled) from dereferencing
    // a null pointer in the emptiness check.
    if ((nullptr == filename) || ('\0' == *filename))
    {
        throw blt_exception("Can't initialize bam_streamer with empty filename\n");
    }

    _bfp = samopen(filename, "rb", 0);

    if (nullptr == _bfp)
    {
        log_os << "ERROR: Failed to open SAM/BAM/CRAM file: " << filename << "\n";
        exit(EXIT_FAILURE);
    }

    if (nullptr == region)
    {
        // read the whole BAM file:

        if (_bfp->header->n_targets)
        {
            // parse a fake region so that header->hash is created
            std::string fake_region(target_id_to_name(0));
            fake_region += ":1-1";
            int ref,beg,end;
            bam_parse_region(_bfp->header, fake_region.c_str(), &ref, &beg, &end);
        }
        return;
    }

    // read a specific region of the bam file:
    set_new_region(region);
}
Esempio n. 5
0
// Prepare `data` for pileup: set the BAM iterator (a region query when
// data->conf->reg is set, NULL otherwise) and then create the pileup iterator.
// A region that fails to parse terminates the process.
void seekRegion(BamReaderData * data) {
	if (!data->conf->reg) {
		// No region requested: general BAM iterator
		data->data->iter = NULL;
	} else {
		// Region requested: build a BAM iterator restricted to it
		int refId, from, to;
		int parsed = bam_parse_region(data->data->h, data->conf->reg, &refId, &from, &to);

		if (parsed < 0) {
			fprintf(stderr, "[%s] malformatted region or wrong seqname for input.\n", __func__);
			exit(1);
		}
		data->data->iter = bam_iter_query(data->idx, refId, from, to);
		data->ref_tid = refId;
	}

	// Pileup iterator over the single input
	data->iter = bam_mplp_init(1, mplp_func, (void**) &data->data);
}
Esempio n. 6
0
/*
 * Read alignments from BAM file `ifn` and pass each one to `func` together with
 * the input handle, the output handle and `data`.
 *
 *   ifn   input BAM path
 *   ofn   optional output BAM path; when non-NULL it is opened for writing with
 *         the input header, otherwise `func` receives a NULL output handle
 *   reg   optional region string; when non-NULL the BAM index of `ifn` is loaded
 *         and only that region is read, otherwise the whole file is read
 *   data  opaque pointer forwarded to `func`
 *   func  per-alignment callback
 *
 * Returns the final read status, which is -1 on a clean end of input.  Any
 * other final status is reported as a truncated file and, like every other
 * failure in this function, terminates the process with exit(1).
 */
int sam_fetch(char *ifn, char *ofn, char *reg, void *data, sam_fetch_f func) {
	int ret = 0;
	samfile_t *in = samopen(ifn, "rb", 0);
	samfile_t *out = 0;

	// The input header is dereferenced below, so a failed open must stop here.
	if (in == 0) {
		fprintf(stderr, "[%s:%d] Failed to open input file \"%s\".\n",
						__func__, __LINE__, ifn);
		exit(1);
	}
	if (ofn) {
		out = samopen(ofn, "wb", in->header);
		// An output was requested: do not hand a NULL handle to the callback.
		if (out == 0) {
			fprintf(stderr, "[%s:%d] Failed to open output file \"%s\".\n",
							__func__, __LINE__, ofn);
			exit(1);
		}
	}

	if (reg) {
		bam_index_t *idx = bam_index_load(ifn);
		if (idx == 0) {
			fprintf(stderr, "[%s:%d] Random alignment retrieval only works for indexed BAM files.\n",
							__func__, __LINE__);
			exit(1);
		}
		int tid, beg, end;
		bam_parse_region(in->header, reg, &tid, &beg, &end);
		if (tid < 0) {
			fprintf(stderr, "[%s:%d] Region \"%s\" specifies an unknown reference name. \n",
							__func__, __LINE__, reg);
			exit(1);
		}
		bam_iter_t iter;
		bam1_t *b = bam_init1();
		iter = bam_iter_query(idx, tid, beg, end);
		while ((ret = bam_iter_read(in->x.bam, iter, b)) >= 0) func(b, in, out, data);
		bam_iter_destroy(iter);
		bam_destroy1(b);
		bam_index_destroy(idx);
	} else {
		bam1_t *b = bam_init1();
		while ((ret = samread(in, b)) >= 0) func(b, in, out, data);
		bam_destroy1(b);
	}
	if (out) samclose(out);
	samclose(in);

	if (ret != -1) {					/* truncated is -2 */
		fprintf(stderr, "[%s:%d] Alignment retrieval failed due to truncated file\n",
						__func__, __LINE__);
		exit(1);
	}

	return ret;
}
/*
 * calDepth entry point.
 *
 * With only a BAM path, the whole file is piled up through pileup_func.
 * With an additional region argument, the BAM index is loaded and only that
 * region is fetched and piled up.  Returns 1 on usage, open, index or region
 * errors and 0 otherwise.
 */
int main(int argc, char *argv[])
{
	tmpstruct_t ctx;

	if (argc == 1) {
		fprintf(stderr, "Usage: calDepth <in.bam> [region]\n");
		return 1;
	}

	// Default window: everything
	ctx.beg = 0;
	ctx.end = 0x7fffffff;
	ctx.in = samopen(argv[1], "rb", 0);
	if (ctx.in == 0) {
		fprintf(stderr, "Fail to open BAM file %s\n", argv[1]);
		return 1;
	}

	if (argc != 2) {
		// A region was given: it needs the index and a pileup buffer
		int tid;
		bam_plbuf_t *plbuf;
		bam_index_t *index = bam_index_load(argv[1]);

		if (index == 0) {
			fprintf(stderr, "BAM indexing file is not available.\n");
			return 1;
		}
		bam_parse_region(ctx.in->header, argv[2], &tid, &ctx.beg, &ctx.end);
		if (tid < 0) {
			fprintf(stderr, "Invalid region %s\n", argv[2]);
			return 1;
		}
		plbuf = bam_plbuf_init(pileup_func, &ctx);
		bam_fetch(ctx.in->x.bam, index, tid, ctx.beg, ctx.end, plbuf, fetch_func);
		bam_plbuf_push(0, plbuf); // flush the pileup buffer
		bam_index_destroy(index);
		bam_plbuf_destroy(plbuf);
	} else {
		// No region: pile up the whole file
		sampileup(ctx.in, -1, pileup_func, &ctx);
	}

	samclose(ctx.in);
	return 0;
}
Esempio n. 8
0
/// Open \p filename for reading and, when \p region is non-NULL, restrict the
/// stream to that region via set_new_region().
///
/// With no region the whole file is streamed; in that case a dummy region on the
/// first target is parsed purely so that header->hash gets created.
///
/// \p filename is asserted to be non-NULL and non-empty.
/// Terminates the process with exit(EXIT_FAILURE) if the file cannot be opened.
bam_streamer::
bam_streamer(const char* filename,
             const char* region)
    : _is_record_set(false), _bfp(NULL), _bidx(NULL), _biter(NULL),
      // Member initializers run before the asserts in the body, so a NULL
      // filename must not reach the initializer unguarded.
      _record_no(0), _stream_name(filename ? filename : ""), _is_region(false)
{

    assert(NULL != filename);
    assert('\0' != *filename);

    _bfp = samopen(filename, "rb", 0);

    if (NULL == _bfp)
    {
        log_os << "ERROR: Failed to open SAM/BAM file: " << filename << "\n";
        exit(EXIT_FAILURE);
    }


    if (NULL == region)
    {
        // read the whole BAM file:

        if (_bfp->header->n_targets)
        {
            // parse a fake region so that header->hash is created
            std::string fake_region(target_id_to_name(0));
            fake_region += ":1-1";
            int ref,beg,end;
            bam_parse_region(_bfp->header, fake_region.c_str(), &ref, &beg, &end);
        }
        return;
    }

    // read a specific region of the bam file:
    set_new_region(region);
}
Esempio n. 9
0
/*
 * Multi-input pileup driver over the n BAM files named in fn[].
 *
 * Every input is opened ("-" selects stdin) and its header read; when conf->reg
 * is set each input is additionally restricted to that region through its index.
 * All covered positions are then visited with bam_mplp_auto().  For each position
 * either a BCF record is written (conf->flag has MPLP_GLF) or a text pileup line
 * is printed to stdout.  The header of the first input supplies the target names.
 *
 * Exits the process if an index cannot be loaded or the region cannot be parsed;
 * otherwise returns 0.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	// zero the aggregates and allocate the per-input arrays
	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	// NOTE(review): n_plp is an int array but is sized with sizeof(int*); this
	// over-allocates where pointers are wider than int -- probably meant sizeof(int).
	n_plp = calloc(n, sizeof(int*));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		// NOTE(review): neither the opened file handle nor the header read below
		// is checked for NULL before use.
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			// region requested: build this input's iterator from its index
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			// the region as resolved against the first input is used for filtering later
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		// only the first header is kept; the others are discarded
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	// per-sample grouping buffers, one slot per sample known to sm
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the BCF header and set up the caller (MPLP_GLF mode only)
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		// target names, each followed by a NUL, concatenated into bh->name
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		// sample names, packed the same way into bh->sname
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		// l_txt counts the terminating NUL as well, hence the 1 +
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	// with a region and a FASTA index, fetch the reference for that target up front
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	// raise the per-file cap so that the total across samples is at least 8000
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);
	// main loop: one iteration per position reported by bam_mplp_auto()
	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		// moved on to another target: drop the old reference and load the new one
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			// BCF output: one record for the site, then optionally one for indels
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			// reference base at pos, or 'N' when no reference is loaded or pos is past its end
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			// text output: target name, 1-based position, reference base, then per-input columns
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					for (j = 0; j < n_plp[i]; ++j)
						pileup_seq(plp[i] + j, pos, ref_len, ref);
					putchar('\t');
					// base qualities, offset by 33 and capped at 126
					for (j = 0; j < n_plp[i]; ++j) {
						const bam_pileup1_t *p = plp[i] + j;
						int c = bam1_qual(p->b)[p->qpos] + 33;
						if (c > 126) c = 126;
						putchar(c);
					}
					// optional column built from core.qual, encoded the same way
					if (conf->flag & MPLP_PRINT_MAPQ) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							int c = plp[i][j].b->core.qual + 33;
							if (c > 126) c = 126;
							putchar(c);
						}
					}
					// optional column: comma-separated 1-based qpos values
					if (conf->flag & MPLP_PRINT_POS) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							if (j > 0) putchar(',');
							printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
						}
					}
				}
			}
			putchar('\n');
		}
	}

	// NOTE(review): bp, bh and bca are still 0 here when MPLP_GLF was not set, so
	// this teardown presumably relies on bcf_close/bcf_hdr_destroy/bcf_call_destroy
	// accepting NULL -- confirm against their implementations.
	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
Esempio n. 10
0
int main_depth(int argc, char *argv[])
#endif
/*
 * bam2depth: print, for every covered position, the target name, the 1-based
 * position and one depth column per input BAM.
 *
 * Options: -r region, -b BED file, -q base quality threshold, -Q mapping
 * quality threshold.  Reads that are deletions/refskips at the position or
 * whose base quality is below -q are not counted.
 *
 * Returns 0 on success and 1 on a usage error, when an input cannot be opened,
 * when the region cannot be parsed, or when an index needed for the region
 * cannot be loaded.
 */
{
	int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, status = 0;
	const bam_pileup1_t **plp;
	char *reg = 0; // specified region
	void *bed = 0; // BED data structure
	bam_header_t *h = 0; // BAM header of the 1st input
	aux_t **data;
	bam_mplp_t mplp;

	// parse the command line
	while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) {
		switch (n) {
			case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
			case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
			case 'q': baseQ = atoi(optarg); break;   // base quality threshold
			case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
		}
	}
	if (optind == argc) {
		fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n");
		return 1;
	}

	// initialize the auxiliary data structures
	n = argc - optind; // the number of BAMs on the command line
	data = (aux_t **) calloc(n, sizeof(void*)); // data[i] for the i-th input; unused slots stay NULL
	beg = 0; end = 1<<30; tid = -1;  // set the default region
	for (i = 0; i < n; ++i) {
		bam_header_t *htmp;
		data[i] = (aux_t *) calloc(1, sizeof(aux_t));
		data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
		if (data[i]->fp == 0) { // the header read below needs a valid handle
			fprintf(stderr, "[%s] fail to open \"%s\"\n", __func__, argv[optind+i]);
			status = 1;
			goto depth_end;
		}
		data[i]->min_mapQ = mapQ;                    // set the mapQ filter
		htmp = bam_header_read(data[i]->fp);         // read the BAM header
		if (i == 0) {
			h = htmp; // keep the header of the 1st BAM
			// also parse the region; a bad region is an error rather than a
			// silent fall-back to the whole file
			if (reg && bam_parse_region(h, reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname: \"%s\"\n", __func__, reg);
				status = 1;
				goto depth_end;
			}
		} else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
		if (tid >= 0) { // if a region is specified and parsed successfully
			bam_index_t *idx = bam_index_load(argv[optind+i]);  // load the index
			if (idx == 0) { // a region query cannot be built without the index
				fprintf(stderr, "[%s] fail to load index for \"%s\"\n", __func__, argv[optind+i]);
				status = 1;
				goto depth_end;
			}
			data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
			bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
		}
	}

	// the core multi-pileup loop
	mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
	n_plp = (int*) calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
	plp = (bam_pileup1_t **) calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
	while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
		if (pos < beg || pos >= end) continue; // out of range; skip
		if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
		fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
		for (i = 0; i < n; ++i) { // base level filters have to go here
			int j, m = 0;
			for (j = 0; j < n_plp[i]; ++j) {
				const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know
				if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
				else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
			}
			printf("\t%d", n_plp[i] - m); // this is the depth to output
		}
		putchar('\n');
	}
	free(n_plp); free(plp);
	bam_mplp_destroy(mplp);

depth_end: // shared teardown; also reached from a failed initialization above
	if (h) bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		if (data[i] == 0) continue; // slot never initialized
		if (data[i]->fp) bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(reg);
	if (bed) bed_destroy(bed);
	return status;
}
Esempio n. 11
0
int main_depth(int argc, char *argv[])
#endif
/*
 * samtools depth: print, for every covered position, the target name, the
 * 1-based position and one depth column per input BAM.
 *
 * Inputs come from the command line or, with -f, from a file listing one BAM
 * path per line.  Other options: -r region, -b BED file, -l minimum query
 * length, -q base quality threshold, -Q mapping quality threshold.  Reads that
 * are deletions/refskips at the position or whose base quality is below -q are
 * not counted.
 *
 * Returns 0 on success and 1 on a usage error, when the file list cannot be
 * read, when an input cannot be opened, when the region cannot be parsed, or
 * when an index needed for the region cannot be loaded.
 */
{
	int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles, status = 0;
	const bam_pileup1_t **plp;
	char *reg = 0; // specified region
	void *bed = 0; // BED data structure
    char *file_list = NULL, **fn = NULL;
	bam_header_t *h = 0; // BAM header of the 1st input
	aux_t **data;
	bam_mplp_t mplp;

	// parse the command line
	while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
		switch (n) {
			case 'l': min_len = atoi(optarg); break; // minimum query length
			case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
			case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
			case 'q': baseQ = atoi(optarg); break;   // base quality threshold
			case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
			case 'f': file_list = optarg; break;
		}
	}
	if (optind == argc && !file_list) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
        fprintf(stderr, "Options:\n");
        fprintf(stderr, "   -b <bed>            list of positions or regions\n");
        fprintf(stderr, "   -f <list>           list of input BAM filenames, one per line [null]\n");
        fprintf(stderr, "   -l <int>            minQLen\n");
        fprintf(stderr, "   -q <int>            base quality threshold\n");
        fprintf(stderr, "   -Q <int>            mapping quality threshold\n");
        fprintf(stderr, "   -r <chr:from-to>    region\n");
        fprintf(stderr, "\n");
		return 1;
	}

	// initialize the auxiliary data structures
    if (file_list) 
    {
        if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
        n = nfiles;
        // from here on argv[optind+i] indexes the names read from the list
        argv = fn;
        optind = 0;
    }
    else
        n = argc - optind; // the number of BAMs on the command line
	data = calloc(n, sizeof(void*)); // data[i] for the i-th input; unused slots stay NULL
	beg = 0; end = 1<<30; tid = -1;  // set the default region
	for (i = 0; i < n; ++i) {
		bam_header_t *htmp;
		data[i] = calloc(1, sizeof(aux_t));
		data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
		if (data[i]->fp == 0) { // the header read below needs a valid handle
			fprintf(stderr, "[%s] fail to open \"%s\"\n", __func__, argv[optind+i]);
			status = 1;
			goto depth_end;
		}
		data[i]->min_mapQ = mapQ;                    // set the mapQ filter
		data[i]->min_len  = min_len;                 // set the qlen filter
		htmp = bam_header_read(data[i]->fp);         // read the BAM header
		if (i == 0) {
			h = htmp; // keep the header of the 1st BAM
			// also parse the region; a bad region is an error rather than a
			// silent fall-back to the whole file
			if (reg && bam_parse_region(h, reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname: \"%s\"\n", __func__, reg);
				status = 1;
				goto depth_end;
			}
		} else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
		if (tid >= 0) { // if a region is specified and parsed successfully
			bam_index_t *idx = bam_index_load(argv[optind+i]);  // load the index
			if (idx == 0) { // a region query cannot be built without the index
				fprintf(stderr, "[%s] fail to load index for \"%s\"\n", __func__, argv[optind+i]);
				status = 1;
				goto depth_end;
			}
			data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
			bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
		}
	}

	// the core multi-pileup loop
	mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
	bam_mplp_set_maxcnt(mplp,2147483647); // set max_depth to int max
	n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
	plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
	while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
		if (pos < beg || pos >= end) continue; // out of range; skip
		if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
		fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
		for (i = 0; i < n; ++i) { // base level filters have to go here
			int j, m = 0;
			for (j = 0; j < n_plp[i]; ++j) {
				const bam_pileup1_t *p = plp[i] + j; // DON'T modify plp[][] unless you really know
				if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
				else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
			}
			printf("\t%d", n_plp[i] - m); // this is the depth to output
		}
		putchar('\n');
	}
	free(n_plp); free(plp);
	bam_mplp_destroy(mplp);

depth_end: // shared teardown; also reached from a failed initialization above
	if (h) bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		if (data[i] == 0) continue; // slot never initialized
		if (data[i]->fp) bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(reg);
	if (bed) bed_destroy(bed);
    if ( file_list )
    {
        for (i=0; i<n; i++) free(fn[i]);
        free(fn);
    }
	return status;
}
Esempio n. 12
0
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level)
#endif
/*
 * Merge the n BAM files fn[] into `out` ("-" selects stdout) using a heap keyed
 * on (tid, pos, strand) and input order.
 *
 *   by_qname  stored in g_is_by_qname for the heap comparison
 *   headers   optional SAM file whose text header replaces that of the output
 *   flag      MERGE_RG tags each record with an RG derived from its file name;
 *             MERGE_UNCOMP / MERGE_LEVEL1 force compression level 0 / 1
 *   reg       optional region; when set every input is read through its index
 *   level     compression level (capped at 9) appended to the output mode
 *             when >= 0
 *
 * Returns 0 on success and -1 on failure.
 */
{
	bamFile fpout, *fp;
	heap1_t *heap;
	bam_header_t *hout = 0;
	bam_header_t *hheaders = NULL;
	int i, j, *RG_len = 0;
	uint64_t idx = 0;
	char **RG = 0, mode[8];
	bam_iter_t *iter = 0;

	if (headers) {
		tamFile fpheaders = sam_open(headers);
		if (fpheaders == 0) {
			const char *message = strerror(errno);
			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
			return -1;
		}
		hheaders = sam_header_read(fpheaders);
		sam_close(fpheaders);
	}

	g_is_by_qname = by_qname;
	fp = (bamFile*)calloc(n, sizeof(bamFile));
	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
	// prepare RG tag: the file's base name with a trailing ".bam" removed
	if (flag & MERGE_RG) {
		RG = (char**)calloc(n, sizeof(void*));
		RG_len = (int*)calloc(n, sizeof(int));
		for (i = 0; i != n; ++i) {
			int l = strlen(fn[i]);
			const char *s = fn[i];
			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
			++j; l -= j;
			RG[i] = calloc(l + 1, 1);
			RG_len[i] = l;
			strncpy(RG[i], s + j, l);
		}
	}
	// read the first
	for (i = 0; i != n; ++i) {
		bam_header_t *hin;
		fp[i] = bam_open(fn[i], "r");
		if (fp[i] == 0) {
			int j;
			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
			for (j = 0; j < i; ++j) bam_close(fp[j]);
			// release everything allocated so far; no iterator or heap record
			// has been created at this point
			if (RG) {
				for (j = 0; j < n; ++j) free(RG[j]);
				free(RG); free(RG_len);
			}
			if (hout) bam_header_destroy(hout);
			if (hheaders) bam_header_destroy(hheaders);
			free(fp); free(heap); free(iter);
			return -1;
		}
		hin = bam_header_read(fp[i]);
		if (i == 0) { // the first BAM
			hout = hin;
		} else { // validate multiple baf
			int min_n_targets = hout->n_targets;
			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;

			for (j = 0; j < min_n_targets; ++j)
				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
							hout->target_name[j], hin->target_name[j], fn[i]);
					return -1;
				}

			// If this input file has additional target reference sequences,
			// add them to the headers to be output
			if (hin->n_targets > hout->n_targets) {
				swap_header_targets(hout, hin);
				// FIXME Possibly we should also create @SQ text headers
				// for the newly added reference sequences
			}

			bam_header_destroy(hin);
		}
	}

	if (hheaders) {
		// If the text headers to be swapped in include any @SQ headers,
		// check that they are consistent with the existing binary list
		// of reference information.
		if (hheaders->n_targets > 0) {
			// When a region is given a count mismatch is tolerated, so only the
			// entries present in BOTH lists may be compared.
			int n_cmp = hout->n_targets < hheaders->n_targets? hout->n_targets : hheaders->n_targets;
			if (hout->n_targets != hheaders->n_targets) {
				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
				if (!reg) return -1;
			}
			for (j = 0; j < n_cmp; ++j)
				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
					if (!reg) return -1;
				}
		}

		swap_header_text(hout, hheaders);
		bam_header_destroy(hheaders);
	}

	if (reg) {
		int tid, beg, end;
		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
			return -1;
		}
		for (i = 0; i < n; ++i) {
			bam_index_t *bidx;
			bidx = bam_index_load(fn[i]);
			if (bidx == 0) { // a region query cannot be built without the index
				fprintf(stderr, "[%s] fail to load the index for '%s'\n", __func__, fn[i]);
				return -1;
			}
			iter[i] = bam_iter_query(bidx, tid, beg, end);
			bam_index_destroy(bidx);
		}
	}

	// prime the heap with the first record of every input
	for (i = 0; i < n; ++i) {
		heap1_t *h = heap + i;
		h->i = i;
		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
			h->idx = idx++;
		}
		else h->pos = HEAP_EMPTY;
	}
	if (flag & MERGE_UNCOMP) level = 0;
	else if (flag & MERGE_LEVEL1) level = 1;
	strcpy(mode, "w");
	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
	// open with the computed mode so that the requested level takes effect
	if ((fpout = strcmp(out, "-")? bam_open(out, mode) : bam_dopen(fileno(stdout), mode)) == 0) {
		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
		return -1;
	}
	bam_header_write(fpout, hout);
	bam_header_destroy(hout);
#ifndef _PBGZF_USE 
	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
#endif

	ks_heapmake(heap, n, heap);
	// repeatedly emit the heap top and refill it from the same input
	while (heap->pos != HEAP_EMPTY) {
		bam1_t *b = heap->b;
		if (flag & MERGE_RG) {
			uint8_t *rg = bam_aux_get(b, "RG");
			if (rg) bam_aux_del(b, rg);
			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
		}
		bam_write1_core(fpout, &b->core, b->data_len, b->data);
		if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
			heap->idx = idx++;
		} else if (j == -1) {
			heap->pos = HEAP_EMPTY;
			free(heap->b->data); free(heap->b);
			heap->b = 0;
		} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
		ks_heapadjust(heap, 0, n, heap);
	}

	if (flag & MERGE_RG) {
		for (i = 0; i != n; ++i) free(RG[i]);
		free(RG); free(RG_len);
	}
	for (i = 0; i != n; ++i) {
		bam_iter_destroy(iter[i]);
		bam_close(fp[i]);
	}
	bam_close(fpout);
	free(fp); free(heap); free(iter);
	return 0;
}
Esempio n. 13
0
BamX::BamX(pars & Params1)	// optional constructor
{
    // parameters
    Params=Params1;
    Nread=0;
    Npair=0;
    Nproper=0;
    Nout=0;
    LFlow=INT_MIN;
    LFhigh=INT_MAX;
    region.limit=false;
    IlluminizeBam=0;

    outFragTailBam=false;
    outInterChromBam=false;
    outUniqueMultipleBam=false;
    outUniquePartialBam=false;
    outUniqueUnmappedBam=false;
    outAllPairsBam=false;
    outReadPairPosBam=false;
    
    //output file
	//samfile_t *fp;
    bam_header_t *bam_header;
    
    string s = Params.getInput();
    BamUtil bam1(s);
    Bam = bam1;

    string filename=extractfilename(s);
    
    // parameters
    string fragPosFile = Params.getString("ReadPairPosFile");
    string r = Params.getString("ChromRegion");
    int maxReads = Params.getInt("MaxReads");
    Qmin = Params.getInt("Qmin");
    LRmin = Params.getInt("MinReadLength");
    maxmismatchPC=Params.getDouble("FractionMaxMisMatches");
    FragLengthWindow=Params.getInt("FragmentLengthWindow");
    int cmd_MateMode=Params.getInt("ReadPairSenseConfig");
    string ReferenceFastaFile=Params.getString("ReferenceFastaFile");
    FragmentTailPercent=Params.getDouble("FragmentTailPercent");
    IlluminizeBam=Params.getInt("Illuminize")>0;
    outputDirectory=Params.getString("OutputDirectory");
    int minLR=Params.getInt("MinReadLength"); 
    int SplitBracketMin=Params.getInt("SplitBracketMin"); 
    int SplitBaseQmin=Params.getInt("SplitBaseQmin"); 
    
    string StatFile=Params.getString("StatFile");
    if (StatFile.size()>0) {
        hists H1(StatFile);
        hist HLF=H1.h["LF"];
        hist HLR=H1.h["LR"];
        Params.setHist("LF",HLF);
        Params.setHist("LR",HLR);        
        H1.h.clear();  // free some memory 
        if (FragmentTailPercent>0) {
            LFlow=int(HLF.p2xTrim(FragmentTailPercent/100.));   
            LFhigh=int(HLF.p2xTrim(1-FragmentTailPercent/100.));   
        }
    }
    
    int dbg = Params.getInt("Dbg");
    time(&tprev);
    
    if (ReferenceFastaFile.size()>0) {
        FastaObj RF1(ReferenceFastaFile, "");
        Reference=RF1;
        RF1.seq.clear();  // free some memory 
    }
    
    bam_header= Bam.fp->header;
    string bamheadertext = bam_header->text;
    ReadGroup = extractBamTag(bamheadertext,"@RG");
    
    outAllPairsBam=(r.size()>0);
    if (!outAllPairsBam) { 
        outFragTailBam=true; //FragmentTailPercent>=0;
        outInterChromBam=true;
        outUniqueMultipleBam=true;
        outUniquePartialBam=true;
        outUniqueUnmappedBam=true;
    }
    // output Bams
    outputBam.clear();
    
    /*
    // test BamHeaderContainer
    vector<BamHeaderContainer> x;
    string sv=SpannerVersion;    
    string q="@PG\tID:FragmentTail\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
    while (true) {
        string outfile=outputDirectory+"/"+filename+".fragtail.bam";
        q=q+"\n@PG\tID:FragmentTail\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        BamHeaderContainer x1( bam_header, q); 
        x.push_back(x1);
        bam_header_t* h1=x[x.size()-1].header();
        cout<< h1->text << endl;
    }
    cout << x.size() << endl;
    */
    
    samfile_t *fpFT=0;
    samfile_t *fpIC=0;
    samfile_t *fpUM=0;
    samfile_t *fpUP=0;
    samfile_t *fpUZ=0;
    samfile_t *fpAP=0;
    samfile_t *fpWP=0;
    
    //region
    if (r.size()>0) {
        int r1,r2,r3;
        C_region r0(r); 
        region=r0;
        string bamRegion=region.region;
        size_t k=bamRegion.find("chr");
        if (k!=string::npos) {
            bamRegion=bamRegion.substr(3);
        }

        if ( bam_parse_region(bam_header, bamRegion.c_str(), &r1, &r2, &r3)==0) {
            region.limit=true;
            region.anchor=r1;
            region.start=r2;
            region.end=r3;
        } else {
            cerr << "region not found\t" << r << endl;
            exit(111);
        }
        
    }
    
    
    //fragPosFile
    if (fragPosFile.size()>0) {
        
        FragmentPosFileObj fp(fragPosFile);
        if (fp.fragmentPosList.size()>0) {
            FragPos=fp;
        } else {
            cerr << "Read Pair Pos file not found\t" <<  fragPosFile << endl;
            exit(112);
        }
        outFragTailBam=false; 
        outInterChromBam=false;
        outUniqueMultipleBam=false;
        outUniquePartialBam=false;
        outUniqueUnmappedBam=false;
        outReadPairPosBam=true;
        
    }

    
    if (outAllPairsBam) {
        string outfile=outputDirectory+"/"+filename+"."+r+".bam";
        string sv=SpannerVersion;
        string q="@PG\tID:Region\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        outputBam["AP"]=BamHeaderContainer(bam_header,q); 
        bam_header_t* h1=outputBam["AP"].header();
        if ((fpAP = samopen(outfile.c_str(), "wb", h1)) == 0) {
            fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str());
            exit(160);
        }
    }

    
    if (outFragTailBam) {
        string outfile=outputDirectory+"/"+filename+".fragtail.bam";
        string sv=SpannerVersion;
        string q="@PG\tID:FragmentTail\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        outputBam["FT"]=BamHeaderContainer(bam_header,q); 
        bam_header_t* h1=outputBam["FT"].header();
        if ((fpFT = samopen(outfile.c_str(), "wb", h1)) == 0) {
            fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str());
            exit(161);
        }
    }
     
    if (outInterChromBam) {
        string outfile=outputDirectory+"/"+filename+".interchrom.bam";
        string sv=SpannerVersion;
        string q="@PG\tID:InterChromPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        outputBam["IC"]=BamHeaderContainer(bam_header,q);   
        bam_header_t* h1=outputBam["IC"].header();
        if ((fpIC = samopen(outfile.c_str(), "wb", h1)) == 0) {
            fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str());
            exit(162);
        }
    }
    
    if (outUniqueMultipleBam) {
        string outfile=outputDirectory+"/"+filename+".uMult.bam";
        string sv=SpannerVersion;
        string q="@PG\tID:uniqMultiplyMappedPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        outputBam["UM"]=BamHeaderContainer(bam_header,q); 
        bam_header_t* h1=outputBam["IUM"].header();
        if ((fpUM = samopen(outfile.c_str(), "wb", h1)) == 0) {
            fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str());
            exit(163);
        }
    }
    
    if (outUniquePartialBam) {        
        string outfile=outputDirectory+"/"+filename+".uPart.bam";
        string sv=SpannerVersion;
        string q="@PG\tID:uniqPartiallyMappedPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        outputBam["UP"]=BamHeaderContainer(bam_header,q);  
        bam_header_t* h1=outputBam["UP"].header();
        if ((fpUP = samopen(outfile.c_str(), "wb", h1)) == 0) {
            fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str());
            exit(164);
        }
    }

    if (outUniqueUnmappedBam) {        
        string outfile=outputDirectory+"/"+filename+".uUnmapped.bam";
        string sv=SpannerVersion;
        string q="@PG\tID:uniqUnMappedPairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        outputBam["UZ"]=BamHeaderContainer(bam_header,q); 
        bam_header_t* h1=outputBam["UZ"].header();
        if ((fpUZ = samopen(outfile.c_str(), "wb", h1)) == 0) {
            fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str());
            exit(165);
        }

    }

    if (outReadPairPosBam) {        
        string outfile=outputDirectory+"/"+filename+".weirdpairs.bam";
        string sv=SpannerVersion;
        string q="@PG\tID:weirdpairs\tPN:SpannerX\tVN"+sv+"\tCL:"+Params.getCmdLine();
        outputBam["WP"]=BamHeaderContainer(bam_header,q); 
        bam_header_t* h1=outputBam["WP"].header();
        if ((fpWP = samopen(outfile.c_str(), "wb", h1)) == 0) {
            fprintf(stderr, "samopen: Fail to open output BAM file %s\n", filename.c_str());
            exit(165);
        }
        
    }

    
    cout << ReadGroup << endl << endl;
    
    //extractMateMode();
    
    if (cmd_MateMode>=0)   MateMode=cmd_MateMode;          
     
    BamContainerPair bampair;
    
    bool more = true;
    while (more)
    {
        bampair=Bam.getNextBamPair();
        // skip if neither end within region
        more=(bampair.BamEnd.size()>1);
        
        Npair++;
        if (Npair>=maxReads) break; 

        //
        if ( (dbg!=0)&&(elapsedtime()>float(dbg))) {
			time(&tprev);
			cout << " pairs:" << Npair << "\toutput:" << Nout;
			cout << "\tchr:" << bampair.BamEnd[0].b.core.tid+1;
            cout << "\tpos:" << bampair.BamEnd[0].b.core.pos;
            cout << endl;			
		}  
        
        if (!more) continue; 
        if (region.limit) {
            bool overlap = false;
            for (int e=0; e<=1; e++) {                 
                int a1=bampair.BamEnd[e].b.core.tid;
                int p1=bampair.BamEnd[e].b.core.pos;
                int p2=p1+bampair.BamEnd[e].len;
                overlap=region.overlap(a1,p1,p2);
                if (overlap) break; 
            }        
            if (!overlap) continue;
        }
    
        
        bampair.Illuminize(IlluminizeBam);  
        bampair.calcFragmentLengths();
        more=(bampair.BamEnd[1].packeddata.size()>1);
        //if (bampair.BamEnd[0].b.core.tid==bampair.BamEnd[1].b.core.tid) 
        //    cout<< bampair << endl;
        
        bool bothmap = ((bampair.BamEnd[0].b.core.flag&BAM_FUNMAP)==0)&&((bampair.BamEnd[0].b.core.flag&BAM_FMUNMAP)==0);
            
        
        if (outAllPairsBam) {
            Nout++;
            int s1=samwrite(fpAP, &(bampair.BamEnd[0].b));
            int s2=samwrite(fpAP, &(bampair.BamEnd[1].b));
            if ((s1*s2)>0) {
                continue;
            } else {
                cerr << "bad write to pairs.bam" << endl;
                exit(150);
            }
        }

        
        if (outReadPairPosBam) {
            int ichr1=bampair.BamEnd[0].b.core.tid+1;
            int istd1=bampair.BamEnd[0].sense=='+'? 0: 1;
            int ista1=bampair.BamEnd[0].b.core.pos+1;
            int iq1=bampair.BamEnd[0].q;
            int ichr2=bampair.BamEnd[1].b.core.tid+1;
            int istd2=bampair.BamEnd[1].sense=='+'? 0: 1;
            int ista2=bampair.BamEnd[1].b.core.pos+1;
            int iq2=bampair.BamEnd[1].q;
            
            FragmentPosObj  fp1(0,ichr1,istd1,ista1,0,ichr2,istd2,ista2,0,iq1, iq2,0);
            
            /*
             if ((fp1.chr1==10)&&(fp1.start1>=89687801)&&(fp1.end1<=89700722)) {
                cout << "read "<< fp1 << endl;                
            }
            */
            if (FragPos.find(fp1)) {
                Nout++;
                int s1=samwrite(fpWP, &(bampair.BamEnd[0].b));
                int s2=samwrite(fpWP, &(bampair.BamEnd[1].b));
                if ((s1*s2)>0) {
                    continue;
                } else {
                    cerr << "bad write to weirdpairs.bam" << endl;
                    exit(156);
                }
            }
        }        
        bool ok[2];
        for (int e=0; e<2; e++) {
            uint8_t*  bq=bam1_qual(&(bampair.BamEnd[e].b));
            int LR=bampair.BamEnd[0].b.core.l_qseq;
            double bok=0;
            for (int ib=0; ib<LR; ib++) {
                if (bq[ib]>SplitBaseQmin) {
                    bok++;
                }
            }
            ok[e]=(bok>LRmin);
        }
        
        if (! (ok[0]&ok[1]) )
            continue;
        
        if ( (outFragTailBam) & ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin)) ) {            
            bool FT=(bampair.FragmentLength>LFhigh)|((bampair.FragmentLength<LFlow)&(bampair.FragmentLength>INT_MIN))&bothmap;
            if (FT && (fpFT!=0)) {
                Nout++;
                int s1=samwrite(fpFT, &(bampair.BamEnd[0].b));
                int s2=samwrite(fpFT, &(bampair.BamEnd[1].b));
                //if (outputBam["FT"].write(&(bampair.BamEnd[0].b),&(bampair.BamEnd[1].b))) {
                if ((s1*s2)>0) {
                    continue;
                } else {
                    cerr << "bad write to fragtail.bam" << endl;
                    exit(151);
                }
            }
        }
        
        if ((outInterChromBam) & ((bampair.BamEnd[0].q>=Qmin)&(bampair.BamEnd[1].q>=Qmin))) { 
            bool IC=(bampair.BamEnd[0].b.core.tid!=bampair.BamEnd[1].b.core.tid)&&bothmap;
            if (IC && (fpIC!=0)) {
                Nout++;
                int s1=samwrite(fpIC, &(bampair.BamEnd[0].b));
                int s2=samwrite(fpIC, &(bampair.BamEnd[1].b));
                if ((s1*s2)>0) {
                    continue;
                } else {
                    cerr << "bad write to interchrom.bam" << endl;
                    exit(152);
                }
            }
        }
        if ((outUniqueMultipleBam) & ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin))){
            int im=bampair.BamEnd[0].nmap>1? 0: 1;
            int iu=bampair.BamEnd[0].q>=Qmin? 0: 1;
            bool UM=(bampair.BamEnd[iu].nmap>1)&&(iu!=im)&&bothmap;            
            if (UM && (fpUM!=0)) {
                Nout++;
                int s1=samwrite(fpUM, &(bampair.BamEnd[0].b));
                int s2=samwrite(fpUM, &(bampair.BamEnd[1].b));
                if ((s1*s2)>0) {
                    continue;
                } else {
                    cerr << "bad write to uMult.bam" << endl;
                    exit(153);
                }
            }
        }
        if ( (outUniquePartialBam) && ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin)) && bothmap) {            
            int c0=bampair.BamEnd[0].clip[0]+bampair.BamEnd[0].clip[1];
            int LR=bampair.BamEnd[0].b.core.l_qseq;
            bool split0=((LR-c0)>SplitBracketMin)&(c0>SplitBracketMin);
            int ib0=0;
            if ((split0)&(bampair.BamEnd[0].clip[0]>SplitBracketMin)) {
                ib0=bampair.BamEnd[0].clip[0];
            } else if ((split0)&(bampair.BamEnd[0].clip[1]>SplitBracketMin) ) {
                ib0=LR-bampair.BamEnd[0].clip[1];
            }
            split0=split0&(ib0>0);
            if (split0) {
                uint8_t*  bq=bam1_qual(&(bampair.BamEnd[0].b));
                for (int ib=(ib0-SplitBracketMin); ib<(ib0+SplitBracketMin); ib++) {
                    if (bq[ib]<SplitBaseQmin) {
                        split0=false;
                        break;
                    }
                }
            }
            
            int c1=bampair.BamEnd[1].clip[0]+bampair.BamEnd[1].clip[1];
            LR=bampair.BamEnd[1].b.core.l_qseq;
            bool split1=((LR-c0)>SplitBracketMin)&(c1>SplitBracketMin);;
            int ib1=0;
            if ((split1)&(bampair.BamEnd[1].clip[0]>SplitBracketMin)) {
                ib1=bampair.BamEnd[1].clip[0];
            } else if ((split1)&(bampair.BamEnd[1].clip[1]>SplitBracketMin) ) {
                ib1=LR-bampair.BamEnd[1].clip[1];
            }
            split1=split1&(ib1>0);
            if (split1) {
                uint8_t*  bq=bam1_qual(&(bampair.BamEnd[1].b));
                for (int ib=(ib1-SplitBracketMin); ib<(ib1+SplitBracketMin); ib++) {
                    if (bq[ib]<SplitBaseQmin) {
                        split1=false;
                        break;
                    }
                }
            }
            bool UP=(split0|split1)&((c1+c0)>minLR);
            if (UP && (fpUP!=0)) {
                Nout++;
                int s1=samwrite(fpUP, &(bampair.BamEnd[0].b));
                int s2=samwrite(fpUP, &(bampair.BamEnd[1].b));
                if ((s1*s2)>0) {
                    continue;
                } else {
                    cerr << "bad write to uPart.bam" << endl;
                    exit(154);
                }
            }
        }
        if ( (outUniqueUnmappedBam) & ((bampair.BamEnd[0].q>=Qmin)|(bampair.BamEnd[1].q>=Qmin)) ) {
            bool z0=((bampair.BamEnd[0].b.core.flag&BAM_FUNMAP)>0);
            bool z1=((bampair.BamEnd[1].b.core.flag&BAM_FUNMAP)>0);  
            
            
            uint8_t*  bq=bam1_qual(&(bampair.BamEnd[0].b));
            for (int nb,ib=0; ib<bampair.BamEnd[0].b.core.l_qseq; ib++) {
                if (bq[ib]<SplitBaseQmin) {
                    nb++;
                }
            }

            
            bool UZ=(z0|z1)&(!(z1&z0));
            if (UZ && (fpUZ!=0)) {
                Nout++;
                int s1=samwrite(fpUZ, &(bampair.BamEnd[0].b));
                int s2=samwrite(fpUZ, &(bampair.BamEnd[1].b));
                if ((s1*s2)>0) {
                    continue;
                } else {
                    cerr << "bad write to uUnmapped.bam" << endl;
                    exit(155);
                }
            }
        }
        
        
        //cout<< bampair.Orientation << "\t"<< bampair.FragmentLength << "\t" <<bampair.BamEnd[1].b.core.pos << endl;
        
    }
    
    if (outReadPairPosBam) {
        samclose(fpWP);
    } else {        
        if (outAllPairsBam) {
            samclose(fpAP);
        } else {       
            samclose(fpFT);
            samclose(fpIC);    
            samclose(fpUP);    
            samclose(fpUM);
            samclose(fpUZ);
        }
    }
    
    /*
     for (ioutputBam=outputBam.begin(); ioutputBam!=outputBam.end(); ioutputBam++) {
        (*ioutputBam).second.close();
    }
     
    if (FragmentTailPercent>0) 
        outputBam["FT"].close();
    */
    
    samclose(Bam.fp);
    
    
}
Esempio n. 14
0
/* Capacity of each per-column scratch buffer (indel text and read-offset samples). */
enum { MPLP_COL_BUF_LEN = 10000 };
/* Farthest neighbouring reference offset examined when measuring a repeat run. */
enum { MPLP_MAX_REPEAT_SCAN = 15 };

/*
 * Append one character to a scratch buffer of MPLP_COL_BUF_LEN bytes.
 * Once the buffer is full further characters are dropped, so a very deep
 * column truncates its indel text instead of writing past the array.
 */
static void mplp_buf_put(char *buf, int *len, char c)
{
	if (*len < MPLP_COL_BUF_LEN) buf[(*len)++] = c;
}

/* Reference base at offset `at`, or 'N' when no reference is loaded or `at` lies outside it. */
static char mplp_ref_base(const char *ref, int ref_len, int at)
{
	return (ref && at >= 0 && at < ref_len)? ref[at] : 'N';
}

/*
 * Length of the longer of the two same-base runs flanking column `pos`.
 *
 * Left run:  counts offsets k = 1..MPLP_MAX_REPEAT_SCAN while ref[pos-k] equals ref[pos-1].
 * Right run: counts offsets k = 1..MPLP_MAX_REPEAT_SCAN while ref[pos+k] equals ref[pos+1].
 * Comparison is case-insensitive and each scan stops at the first differing base
 * or at the edge of the reference. Returns 0 when no reference is available.
 */
static int mplp_repeat_run(const char *ref, int ref_len, int pos)
{
	int left = 0, right = 0, k;
	if (ref == 0) return 0;
	for (k = 1; k <= MPLP_MAX_REPEAT_SCAN; ++k) {
		if (pos - k < 0 || pos - 1 >= ref_len) break; // would read outside the reference
		if (toupper((unsigned char)ref[pos-1]) != toupper((unsigned char)ref[pos-k])) break;
		++left;
	}
	for (k = 1; k <= MPLP_MAX_REPEAT_SCAN; ++k) {
		if (pos + k >= ref_len) break; // would read outside the reference
		if (toupper((unsigned char)ref[pos+1]) != toupper((unsigned char)ref[pos+k])) break;
		++right;
	}
	return left > right? left : right;
}

/*
 * Multi-sample pileup over `n` BAM inputs.
 *
 * conf  run configuration (region, BED filter, FASTA index, filters, output flags)
 * n     number of input files
 * fn    input file names; "-" reads the BAM stream from stdin
 *
 * With MPLP_GLF set, genotype likelihoods (and, unless MPLP_NO_INDEL is set,
 * indel calls) are written as BCF to stdout. Otherwise one text line is printed
 * per column: target name, 1-based position, reference base, and for every
 * input its depth followed by either "*\t*" (depth 0) or
 *   - ten per-strand counts taken from nt16 codes 1, 2, 4, 8 and 7,
 *   - deleted reference bases for strand 0 and strand 1 (comma terminated per read),
 *   - inserted read bases for strand 0 and strand 1 (comma terminated per read),
 *   - the number of reads kept and the number of reads struck by the filters,
 *   - the median absolute deviation of the distance-to-read-end of
 *     non-reference observations (-1 when there are none).
 *
 * Returns 0. Calls exit(1) when an input, its header or its index cannot be
 * read, or when the requested region cannot be parsed.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	// ref_len starts at 0 so that it is never read uninitialised when no reference is loaded
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		if (data[i]->fp == 0) {
			fprintf(stderr, "[%s] fail to open %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		if (h_tmp == 0) {
			fprintf(stderr, "[%s] fail to read the header of %d-th input.\n", __func__, i+1);
			exit(1);
		}
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			// the first input defines the region used to clip pileup columns below
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);

	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			// moved on to another target: swap in its reference sequence (if a FASTA index is configured)
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			// Both values depend only on the reference and the column, so they are
			// computed once here rather than once per input file.
			const int repT = mplp_repeat_run(ref, ref_len, pos);
			const char refChar = mplp_ref_base(ref, ref_len, pos);
			const int refUpper = toupper((unsigned char)refChar);

			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, refChar);
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					//MDW start
					// per-base counts indexed by [4-bit nt16 code][strand]
					int countChars[16][2] = {{0}};
					int numStruck=0; // reads rejected by the filters below
					int numGood=0;   // reads that passed the filters
					int k;

					// inserted read bases, one comma-terminated entry per read, by strand
					char insStr0[MPLP_COL_BUF_LEN];
					int iCnt0=0;
					char insStr1[MPLP_COL_BUF_LEN];
					int iCnt1=0;

					// deleted reference bases, one comma-terminated entry per read, by strand
					char delStr0[MPLP_COL_BUF_LEN];
					int dCnt0=0;
					char delStr1[MPLP_COL_BUF_LEN];
					int dCnt1=0;

					// distance to the nearer aligned read end for each non-reference observation
					float qposP[MPLP_COL_BUF_LEN];
					int qposCnt=0;

					float medAbsDev = -1;

					for (j = 0; j < n_plp[i]; ++j){
						const bam_pileup1_t *p = plp[i] + j;
						const int strand = bam1_strand(p->b);

						/*
						SAME LOGIC AS pileup_seq()
						*/

						if(p->is_refskip){ // never count intron gaps in numStruck
							continue;
						}

						if(p->is_del){ // skip deletion gap, after first position which is the first aligned char
							continue;
						}

						if( 	p->b->core.qual < conf->min_mqToCount  || // mapping quality
							conf->maxrepC < (repT) || // max homopolymer run
							bam1_qual(p->b)[p->qpos] < conf->min_baseQ || // base quality (is_del is always false here)
							p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) ||  // trimEnd is 1-based
							p->zf == 1 || // fusion tag
							p->ih > conf->maxIH  || // max hit index
							(p->nmd > conf->maxNM) || // max mismatch
							(conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
							(conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary
							(conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup
							(conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY))  || // optionally strike secondary or dup
							(conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) ))   // optionally strike secondary, dup, QCfail and improper pairs
						){
							numStruck++;
							continue;
						}

						// every read that survives the filters is counted exactly once
						numGood++;

						if(p->indel==0){
							countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ strand ] ++;
						}else if(p->indel<0){
							// deletion starting after this column: record the deleted reference
							// bases, substituting 'N' where the reference is unavailable
							char *del = strand==0 ? delStr0 : delStr1;
							int *delLen = strand==0 ? &dCnt0 : &dCnt1;
							for(k=1;k<= -p->indel; k++) {
								mplp_buf_put(del, delLen, mplp_ref_base(ref, ref_len, pos+k));
							}
							mplp_buf_put(del, delLen, ',');
						}else{
							// insertion after this column: record the inserted read bases
							char *ins = strand==0 ? insStr0 : insStr1;
							int *insLen = strand==0 ? &iCnt0 : &iCnt1;
							for(k=1;k<= p->indel; k++) {
								mplp_buf_put(ins, insLen, bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + k)]);
							}
							mplp_buf_put(ins, insLen, ',');
						}

						//calculate position of variant within aligned read - no soft clips
						if( refUpper != toupper((unsigned char)bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel!=0 ){
							//distance to end; calculate distance to end of aligned read.  removes soft clips.
							int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
							if(qposCnt < MPLP_COL_BUF_LEN){ // samples beyond the buffer capacity are ignored
								qposP[qposCnt] = distToEnd;
								qposCnt++;
							}
						}
					}

					//print A,C,G,T, by +/-
					// NOTE(review): the fifth pair uses nt16 code 7; in samtools' nt16 table
					// 'N' is code 15 -- confirm which one is intended.
					printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", 	countChars[1][0],countChars[1][1],
											countChars[2][0],countChars[2][1],
											countChars[4][0],countChars[4][1],
											countChars[8][0],countChars[8][1],
											countChars[7][0],countChars[7][1]);

					putchar('\t');
					fwrite(delStr0, 1, dCnt0, stdout);

					putchar('\t');
					fwrite(delStr1, 1, dCnt1, stdout);

					putchar('\t');
					fwrite(insStr0, 1, iCnt0, stdout);

					putchar('\t');
					fwrite(insStr1, 1, iCnt1, stdout);

					printf("\t%d\t%d",numGood,numStruck);

					// get non-ref qpos variation: median absolute deviation of the samples
					if(qposCnt>0){
						const float medqpos = median(qposCnt,qposP);
						// The samples are no longer needed once their median is known, so the
						// deviations overwrite them in place. The magnitude is taken in floating
						// point; integer abs() would truncate a fractional deviation.
						for(k=0;k<qposCnt;k++){
							const float dev = medqpos - qposP[k];
							qposP[k] = dev < 0 ? -dev : dev;
						}
						// use every deviation (a count of qposCnt-1 dropped one and was 0 for a single sample)
						medAbsDev = median(qposCnt,qposP);
					}
					printf("\t%f",medAbsDev);

					///END MDW
				}
			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
Esempio n. 15
0
int main_ld(int argc, char *argv[])
{
    int chr;                  //! chromosome identifier
    int beg;                  //! beginning coordinate for analysis
    int end;                  //! end coordinate for analysis
    int ref;                  //! ref
    long num_windows;         //! number of windows
    std::string msg;          //! string for error message
    bam_plbuf_t *buf;         //! pileup buffer
    ldData t;

    // parse the command line options
    std::string region = t.parseCommandLine(argc, argv);

    // check input BAM file for errors
    t.checkBAM();

    // initialize the sample data structure
    t.bam_smpl_init();

    // add samples
    t.bam_smpl_add();

    // initialize error model
    t.em = errmod_init(1.0-0.83);

    // parse genomic region
    int k = bam_parse_region(t.h, region, &chr, &beg, &end);
    if (k < 0)
    {
        msg = "Bad genome coordinates: " + region;
        fatal_error(msg, __FILE__, __LINE__, 0);
    }

    // fetch reference sequence
    t.ref_base = faidx_fetch_seq(t.fai_file, t.h->target_name[chr], 0, 0x7fffffff, &(t.len));

    // calculate the number of windows
    if (t.flag & BAM_WINDOW)
        num_windows = ((end-beg)-1)/t.win_size;
    else
    {
        t.win_size = (end-beg);
        num_windows = 1;
    }

    // iterate through all windows along specified genomic region
    for (long cw=0; cw < num_windows; cw++)
    {

        // construct genome coordinate string
        std::string scaffold_name(t.h->target_name[chr]);
        std::ostringstream winc(scaffold_name);
        winc.seekp(0, std::ios::end);
        winc << ":" << beg+(cw*t.win_size)+1 << "-" << ((cw+1)*t.win_size)+(beg-1);
        std::string winCoord = winc.str();

        // initialize number of sites to zero
        t.num_sites = 0;

        // parse the BAM file and check if region is retrieved from the reference
        if (t.flag & BAM_WINDOW)
        {
            k = bam_parse_region(t.h, winCoord, &ref, &(t.beg), &(t.end));
            if (k < 0)
            {
                msg = "Bad window coordinates " + winCoord;
                fatal_error(msg, __FILE__, __LINE__, 0);
            }
        }
        else
        {
            ref = chr;
            t.beg = beg;
            t.end = end;
            if (ref < 0)
            {
                msg = "Bad scaffold name: " + region;
                fatal_error(msg, __FILE__, __LINE__, 0);
            }
        }

        // initialize nucdiv variables
        t.init_ld();

        // create population assignments
        t.assign_pops();

        // initialize pileup
        buf = bam_plbuf_init(make_ld, &t);

        // fetch region from bam file
        if ((bam_fetch(t.bam_in->x.bam, t.idx, ref, t.beg, t.end, buf, fetch_func)) < 0)
        {
            msg = "Failed to retrieve region " + region + " due to corrupted BAM index file";
            fatal_error(msg, __FILE__, __LINE__, 0);
        }

        // finalize pileup
        bam_plbuf_push(0, buf);

        // calculate linkage disequilibrium statistics
        ld_func fp[3] = {&ldData::calc_zns, &ldData::calc_omegamax, &ldData::calc_wall};
        (t.*fp[t.output])();

        // print results to stdout
        t.print_ld(chr);

        // take out the garbage
        t.destroy_ld();
        bam_plbuf_destroy(buf);
    }
    // end of window interation

    errmod_destroy(t.em);
    samclose(t.bam_in);
    bam_index_destroy(t.idx);
    t.bam_smpl_destroy();
    free(t.ref_base);

    return 0;
}
Esempio n. 16
0
int main(int argc,char* argv[]) {
    time_t timestamp, current;
    int i,j,k;
    int a,n;
    char *pc;

    FILE *input_file;
    FILE *output_file;
    FILE* log_file=stderr;

    bamFile bam_input;
    bam_header_t *header;
    bam1_t* b;
    bam1_core_t *c;


    char cps_file_name[MAXFILEBUFFLENGTH]="";
    char bam_file_name[MAXFILEBUFFLENGTH]="";
    char out_file_name[MAXFILEBUFFLENGTH]="";
    char log_file_name[MAXFILEBUFFLENGTH]="";

    char buff[MAXFILEBUFFLENGTH];
    char chr[MAXFILEBUFFLENGTH];
    int beg, beg_prev, end, pos, offset; 
    int ref_id, ref_id_prev, label;
    int s, side;
    int read_type, mapped_strand;
    char ch;

    int limit_counts = 0;

    int* contig_count[2];
    int* contig_index[2];
    splice_site** contig_sites[2];

    long int n_reads[N_READ_TYPES][2];

    long int n_total_reads = 0;
    long int n_skipped_reads = 0;

    int max_intron_length=0;
    int min_intron_length=0;
    int ignore_gene_labels = 0;
    int stranded = 1;
    int rev_compl[2] = {1,0};

    int other_end, the_end, donor_id, acceptor_id;

    int *cigar;
    int flagged = 0;
    int margin = 4;


    /** reading input from the command line **/

    timestamp = time(NULL);

    if(argc==1) {
	fprintf(stderr, "BAM2SSJ is the utility for fast counting reads covering splice junctions\nCommand line use:\n");
        fprintf(stderr, "%s -cps <cps_file> -bam <bam_file> [-out <out_file>] [-log <log_file>] [-maxlen <max_intron_length>] [-minlen <min_intron_length>] [-margin <length>] ",argv[0]);
	fprintf(stderr, "[-v suppress verbose output] [-read1 0/1] [-read2 0/1] [-g ignore gene labels] [-u unstranded] [-f count reads flagged 0x800 only]\ntype %s -h for more info\n",argv[0]);
        exit(1);
    }

    for(i=1;i<argc;i++) {
        pc = argv[i];
        if(*pc == '-') {
            if(strcmp(pc+1,"cps") == 0) sscanf(argv[++i], "%s", &cps_file_name[0]);
	    if(strcmp(pc+1,"bam") == 0) sscanf(argv[++i], "%s", &bam_file_name[0]);
	    if(strcmp(pc+1,"out") == 0) sscanf(argv[++i], "%s", &out_file_name[0]);
            if(strcmp(pc+1,"log") == 0) sscanf(argv[++i], "%s", &log_file_name[0]);

            if(strcmp(pc+1,"read1") == 0) sscanf(argv[++i], "%i", &rev_compl[0]);
            if(strcmp(pc+1,"read2") == 0) sscanf(argv[++i], "%i", &rev_compl[1]);

	    if(strcmp(pc+1,"lim") == 0) sscanf(argv[++i], "%i", &limit_counts);
	    if(strcmp(pc+1,"minlen") == 0) sscanf(argv[++i], "%i", &min_intron_length);
	    if(strcmp(pc+1,"maxlen") == 0) sscanf(argv[++i], "%i", &max_intron_length);
	    if(strcmp(pc+1,"margin") == 0) sscanf(argv[++i], "%i", &margin);

	    if(strcmp(pc+1,"v") == 0) verbose = 0;
	    if(strcmp(pc+1,"g") == 0) ignore_gene_labels = 1;
	    if(strcmp(pc+1,"u") == 0) stranded = 0;
	    if(strcmp(pc+1,"f") == 0) flagged = 1;

	    if(strcmp(pc+1,"h") ==0 ) {
		fprintf(stderr, "Input:  (1) sorted BAM file\n");
		fprintf(stderr, "\t(2) CPS (chromosome-position-strand) tab-delimited file sorted by position (chr1 100 + etc)\n\n");
        	fprintf(stderr, "\tIn order to get CPS file from gtf, use the utility gtf2cps.sh\n");
        	fprintf(stderr, "\tImportant: CPS must be sorted by position ONLY!\n\n");
        	fprintf(stderr, "\tIf the 4th column contains (a numeric) gene label then only splice junctions within the same gene will be considered (unless the '-g' option is active)\n");
		fprintf(stderr, "\tThe utility to generate CPS with gene labels is gtf2cps_with_gene_id.sh (or update the script accordingly if you are using genome other than human)\n\n");
		fprintf(stderr, "Options:\n");
        	fprintf(stderr, "\t-maxlen <upper limit on intron length>; 0 = no limit (default=%i)\n",max_intron_length);
		fprintf(stderr, "\t-minlen <lower limit on intron length>; 0 = no limit (default=%i)\n",min_intron_length);
		fprintf(stderr, "\t-margin <length> minimum number of flanking nucleotides in the read in order to support SJ or cover EB, (default=%i)\n",margin);
        	fprintf(stderr, "\t-read1 0/1, reverse complement read1 no/yes (default=%i)\n",rev_compl[0]);
        	fprintf(stderr, "\t-read2 0/1, reverse complement read2 no/yes (default=%i)\n",rev_compl[1]);
        	fprintf(stderr, "\t-g ignore gene labels (column 4 of cps), default=%s\n", ignore_gene_labels ? "ON" : "OFF");
        	fprintf(stderr, "\t-u ignore strand (all reads map to the correct strand), default=%s\n", stranded ? "OFF" : "ON");
		fprintf(stderr, "\t-f count only reads that are flagged 0x800 (uniquely mapped reads), default=%s\n", flagged ? "ON" : "OFF");
		fprintf(stderr, "Output: tab-delimited  (default=stdout)\n");
        	fprintf(stderr, "\tColumn 1 is splice_junction_id\n");
        	fprintf(stderr, "\tColumns 2-6 are counts of 53, 5X, X3, 50, and 03 reads for the correct (annotated) strand\n");
        	fprintf(stderr, "\tColumns 7-11 are similar counts for the incorrect (opposite to annotated) strand\n");
		fprintf(stderr, "Descriptive read statistics are reported to stderr\n");
		exit(1);
	    }
	}
    }

    if(log_file_name[0]==0) {
	log_file = stderr;
    }
    else {
	log_file = fopen(log_file_name,"w");
	if(log_file == NULL) log_file = stderr;
    }

    if(bam_file_name[0]==0) {
	fprintf(log_file,"Bam not specified, exiting\n");
	exit(1); 
    }

    if(cps_file_name[0]==0) {
        fprintf(log_file,"Input not specified, exiting\n");
        exit(1);
    }

    if(out_file_name[0]==0) {
	fprintf(log_file,"[Warning: output set to stdout]\n");
	output_file = stdout;
    }
    else {
	output_file = fopen(out_file_name,"w");
	if(output_file == NULL) {
	    fprintf(log_file,"[Warning: output set to stdout]\n");
            output_file = stdout;
	}
    }

    if(max_intron_length>0) {
	if(verbose) fprintf(log_file,"[Warning: set max intron length=%i]\n",max_intron_length);
    }

    if(ignore_gene_labels) {
	if(verbose) fprintf(log_file,"[Warning: ignoring gene labels (column 4)]\n");
    }

    if(flagged) {
	if(verbose) fprintf(log_file,"[Warning: only look at reads flagged 0x800]\n");
    }

    if(margin>0) {
	if(verbose) fprintf(log_file,"[Warning: read margin set to %i]\n", margin);
    }

    if(verbose) {
	for(s = 0; s < 2; s++) if(rev_compl[s]) fprintf(log_file,"[Warning: take reverse complement of read %i]\n", s+1);
	fprintf(log_file,"[Warning: stranded = %s]\n", stranded ? "TRUE" : "FALSE (always correct strand)");
	if(ignore_gene_labels) fprintf(log_file,"[Warning: ignore gene labels (column 4)]\n");
    }


    for(i = 0; i < N_READ_TYPES; i++) for(s = 0; s < 2; s++) n_reads[i][s] = 0;

    /** initializing BAM and header **/
   
    bam_input = bam_open(bam_file_name, "r");
    header = bam_header_read(bam_input);

    if(bam_input == NULL || header == NULL) {
        fprintf(log_file,"BAM can't be opened or contains no header, exiting\n");
        exit(1);
    }

    /** reading input from CPS **/

    input_file = fopen(cps_file_name, "r");
    if(input_file == NULL) {
	fprintf(log_file,"CPS can't be opened, exiting\n");
        exit(1);
    }

    /** populating gene structure arrays **/

    for(s = 0; s < 2; s++) {
    	contig_count[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN));
    	contig_index[s] = (int*) malloc(sizeof(int) * (header->n_targets + ARRAY_MARGIN));
    	contig_sites[s] = (splice_site**) malloc(sizeof(splice_site*) * (header->n_targets + ARRAY_MARGIN));

    	if(contig_count[s] == NULL || contig_sites[s] == NULL || contig_index[s] == NULL) {
	    fprintf(log_file, "Not enought memory, exiting\n");
            exit(1);
    	}
    }

    for(s = 0; s < 2; s++)
        for(i=0; i < header->n_targets; i++) 
	    contig_count[s][i] = contig_index[s][i] = 0;

    if(verbose) fprintf(log_file, "Reading %s pass1", cps_file_name);
    while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) {
	sscanf(buff, "%s %*i %c", &chr[0], &ch);
	bam_parse_region(header, chr, &i, &beg, &end);
	s = (ch == '+' ? 0 : 1);
	if(i < header->n_targets && i>=0) contig_count[s][i]++;
    }

    for(s = 0; s < 2; s++) {
    	for(i = 0;i < header->n_targets; i++) {
	    contig_sites[s][i] = (splice_site*) malloc(sizeof(splice_site) * (contig_count[s][i] + ARRAY_MARGIN));
	    if(contig_sites[s][i] == NULL) {
	    	fprintf(log_file, "Not enought memory, exiting\n");
            	exit(1);
	    }
	}
    }
    if(verbose) fprintf(log_file, "\n");

    if(verbose) fprintf(log_file, "Reading %s pass2",cps_file_name);
    fseek(input_file, 0, SEEK_SET);
    while(fgets(buff, MAXFILEBUFFLENGTH, input_file)) {
        sscanf(buff, "%s %i %c %i", &chr[0], &pos, &ch, &label);
	bam_parse_region(header, chr, &i, &beg, &end);
	s = (ch == '+' ? 0 : 1);
	if(i < header->n_targets && i>=0) {
	    if(contig_index[s][i]>0) {
		if(pos < contig_sites[s][i][contig_index[s][i]-1].pos) {
		    fprintf(log_file, "Splice sites weren't sorted, exiting\n");
		    exit(1);
		}
	    }
	    contig_sites[s][i][contig_index[s][i]].pos = pos;
	    contig_sites[s][i][contig_index[s][i]].label = ignore_gene_labels ? 0 : label;
	    for(side = 0; side < 2; side++) {
                contig_sites[s][i][contig_index[s][i]].count00[side] = 0;
                contig_sites[s][i][contig_index[s][i]].count5X[side] = 0;
                contig_sites[s][i][contig_index[s][i]].countX3[side] = 0;
		contig_sites[s][i][contig_index[s][i]].junctions = NULL;
	    }
	    contig_index[s][i]++;
	}
    }
    if(verbose) fprintf(log_file, "\n");

    for(s = 0; s < 2; s++)
    	for(i = 0;i < header->n_targets; i++) 
	    contig_index[s][i] = 0;

    /** analysis starts here **/

    b = bam_init1();
    k = 0;
    ref_id_prev = -1;
    beg_prev = -1;
    while(bam_read1(bam_input, b)>=0) {
        c   = &b->core;
	ref_id = c->tid;
	if(ref_id<0) continue;

	if(flagged && ((c->flag & 0x800) == 0)) {
	    n_skipped_reads++;
	    continue;
	}

        if(stranded && ((c->flag & BAM_FREAD1) && (c->flag & BAM_FREAD2) || !(c->flag & BAM_FREAD1) && !(c->flag & BAM_FREAD2))) {
            n_skipped_reads++;
            continue;
        }

        cigar = bam1_cigar(b);

	if(ref_id != ref_id_prev  && ref_id_prev >= 0) {
	    if(contig_index[0][ref_id_prev] + contig_index[1][ref_id_prev] < contig_count[0][ref_id_prev] + contig_count[1][ref_id_prev]) {
		if(log_file==stderr) progressbar(1, 1, header->target_name[ref_id_prev], verbose);
	    }
	    beg_prev = -1;
	}

	/*if(ref_id < ref_id_prev) {
	    fprintf(log_file,"BAM file wasn't sorted, exiting\n");
            exit(1);
	}*/

	ref_id_prev = ref_id;

	beg = c->pos + 1;
	if(beg < beg_prev) {
	    fprintf(log_file,"BAM file wasn't sorted, exiting\n");
	    exit(1);
	}
	beg_prev = beg;

	s = ((c->flag & BAM_FREVERSE)>0);
	mapped_strand = (c->flag & BAM_FREAD1) ? (s + rev_compl[0]) & 1 : (s + rev_compl[1]) & 1;

	the_end = bam_calend(c, cigar);

	for(s = 0; s < 1 + stranded; s++) {
            end = beg;
	    side = (s == mapped_strand) ? 0 : 1;
	    side *= stranded;

	    // keep reading until the current site is on the same chromosome downstream of the read

	    while(contig_sites[s][ref_id][contig_index[s][ref_id]].pos < beg && contig_index[s][ref_id] < contig_count[s][ref_id]) {
		contig_index[s][ref_id]++;
	    	if(log_file==stderr) progressbar(contig_index[0][ref_id]+contig_index[1][ref_id], contig_count[0][ref_id]+contig_count[1][ref_id], header->target_name[ref_id], verbose);
	    }

	    read_type = RT_OTHER;

            if(contig_index[s][ref_id]<contig_count[s][ref_id]) {
	    	// check if the read is a split read and find its other end
	    	read_type = RT_GENOME;
            	for(i = 0; i < c->n_cigar; i++) {
	    	    offset = cigar[i] >> 4;
	    	    switch(cigar[i] & 0x0F) {
		    	case BAM_CMATCH: 	end += offset;  // match to the reference
					 	break;
		    	case BAM_CINS:		end += 0;	// insertion to the reference, pointer stays unchanged
						break;
		    	case BAM_CDEL:		end += offset;	// deletion from the reference (technically the same as 'N') pointer moves
						break; 
		    	case BAM_CREF_SKIP:	other_end = end + offset;
						donor_id = acceptor_id = -INFTY;
						if(end - beg < margin) break;
						if(the_end - other_end < margin) break;
						for(j = contig_index[s][ref_id]; contig_sites[s][ref_id][j].pos <= other_end && j < contig_count[s][ref_id];j++) {
						    if(contig_sites[s][ref_id][j].pos - end < min_intron_length && min_intron_length > 0) continue;
						    if(contig_sites[s][ref_id][j].pos - end > max_intron_length && max_intron_length > 0) break;
					    	    if(contig_sites[s][ref_id][j].label == contig_sites[s][ref_id][contig_index[s][ref_id]].label) {
					    	    	if(contig_sites[s][ref_id][j].pos == end - 1)   donor_id = j;
					    	    	if(contig_sites[s][ref_id][j].pos == other_end) acceptor_id = j;
					    	    }
					    	}
						if(donor_id>0 && acceptor_id>0) {
					    	    update_count(&contig_sites[s][ref_id][donor_id].junctions, acceptor_id, side);
					    	    contig_sites[s][ref_id][donor_id].count5X[side]++;
                                            	    contig_sites[s][ref_id][acceptor_id].countX3[side]++;
					    	    read_type = RT_KJUNCT;
						}
						else {
					    	    read_type = RT_UJUNCT;
						}
						end = other_end;
				 		break;
		    	case BAM_CSOFT_CLIP:
		    	case BAM_CHARD_CLIP:
		    	case BAM_CPAD:		break;
		    	default:		read_type = RT_OTHER;
	    	    }
            	}

	    	if(read_type == RT_GENOME) {
	            for(j=contig_index[s][ref_id]; beg + margin <= contig_sites[s][ref_id][j].pos  && contig_sites[s][ref_id][j].pos < end - margin && j<contig_count[s][ref_id]; j++) {
		    	contig_sites[s][ref_id][j].count00[side]++;
		    	read_type = RT_OVRLAP;
		    	k++;
	    	    }
	    	}
	    }

	    n_reads[read_type][side]++;
	}
	n_total_reads++;

	if(k>limit_counts && limit_counts>0) break;

    }