Beispiel #1
0
int bam_flagstat(int argc, char *argv[])
{
	bamFile fp;
	bam_header_t *header;
	bam_flagstat_t *s;
	if (argc == optind) {
		fprintf(pysamerr, "Usage: samtools flagstat <in.bam>\n");
		return 1;
	}
	fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
	assert(fp);
	header = bam_header_read(fp);
	s = bam_flagstat_core(fp);
	printf("%lld + %lld in total (QC-passed reads + QC-failed reads)\n", s->n_reads[0], s->n_reads[1]);
	printf("%lld + %lld duplicates\n", s->n_dup[0], s->n_dup[1]);
	printf("%lld + %lld mapped (%.2f%%:%.2f%%)\n", s->n_mapped[0], s->n_mapped[1], (float)s->n_mapped[0] / s->n_reads[0] * 100.0, (float)s->n_mapped[1] / s->n_reads[1] * 100.0);
	printf("%lld + %lld paired in sequencing\n", s->n_pair_all[0], s->n_pair_all[1]);
	printf("%lld + %lld read1\n", s->n_read1[0], s->n_read1[1]);
	printf("%lld + %lld read2\n", s->n_read2[0], s->n_read2[1]);
	printf("%lld + %lld properly paired (%.2f%%:%.2f%%)\n", s->n_pair_good[0], s->n_pair_good[1], (float)s->n_pair_good[0] / s->n_pair_all[0] * 100.0, (float)s->n_pair_good[1] / s->n_pair_all[1] * 100.0);
	printf("%lld + %lld with itself and mate mapped\n", s->n_pair_map[0], s->n_pair_map[1]);
	printf("%lld + %lld singletons (%.2f%%:%.2f%%)\n", s->n_sgltn[0], s->n_sgltn[1], (float)s->n_sgltn[0] / s->n_pair_all[0] * 100.0, (float)s->n_sgltn[1] / s->n_pair_all[1] * 100.0);
	printf("%lld + %lld with mate mapped to a different chr\n", s->n_diffchr[0], s->n_diffchr[1]);
	printf("%lld + %lld with mate mapped to a different chr (mapQ>=5)\n", s->n_diffhigh[0], s->n_diffhigh[1]);
	free(s);
	bam_header_destroy(header);
	bam_close(fp);
	return 0;
}
Beispiel #2
0
samfile_t* samopen_in(quip_reader_t reader, void* reader_data, bool binary, void* aux)
{
    samfile_t *fp;
    fp = (samfile_t*)calloc(1, sizeof(samfile_t));

    fp->type |= TYPE_READ;
    if (binary) { // binary
        fp->type |= TYPE_BAM;
        fp->x.bam = bam_open_in(reader, reader_data);
        if (fp->x.bam == 0) goto open_err_ret;
        fp->header = bam_header_read(fp->x.bam);
    } else { // text
        fp->x.tamr = sam_open_in(reader, reader_data);
        if (fp->x.tamr == 0) goto open_err_ret;
        fp->header = sam_header_read(fp->x.tamr);
        if (fp->header->n_targets == 0) { // no @SQ fields
            if (aux) { // check if aux is present
                bam_header_t *textheader = fp->header;
                fp->header = sam_header_read2((const char*)aux);
                if (fp->header == 0) goto open_err_ret;
                append_header_text(fp->header, textheader->text, textheader->l_text);
                bam_header_destroy(textheader);
            }
            if (fp->header->n_targets == 0 && bam_verbose >= 1)
                fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
        } else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
    }

    return fp;

open_err_ret:
    free(fp);
    return 0;
}
Beispiel #3
0
// currently, this function ONLY works if each read has one hit
void bam_mating_core(bamFile in, bamFile out)
{
	bam_header_t *header;
	bam1_t *b[2];
	int curr, has_prev, pre_end = 0, cur_end;
	kstring_t str;

	str.l = str.m = 0; str.s = 0;
	header = bam_header_read(in);
	bam_header_write(out, header);

	b[0] = bam_init1();
	b[1] = bam_init1();
	curr = 0; has_prev = 0;
	while (bam_read1(in, b[curr]) >= 0) {
		bam1_t *cur = b[curr], *pre = b[1-curr];
		if (cur->core.tid < 0) continue;
		cur_end = bam_calend(&cur->core, bam1_cigar(cur));
		if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP;
		if (cur->core.flag & BAM_FSECONDARY) continue; // skip secondary alignments
		if (has_prev) {
			if (strcmp(bam1_qname(cur), bam1_qname(pre)) == 0) { // identical pair name
				cur->core.mtid = pre->core.tid; cur->core.mpos = pre->core.pos;
				pre->core.mtid = cur->core.tid; pre->core.mpos = cur->core.pos;
				if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
					&& !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // set TLEN/ISIZE
				{
					uint32_t cur5, pre5;
					cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
					pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
					cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
				} else cur->core.isize = pre->core.isize = 0;
				if (pre->core.flag&BAM_FREVERSE) cur->core.flag |= BAM_FMREVERSE;
				else cur->core.flag &= ~BAM_FMREVERSE;
				if (cur->core.flag&BAM_FREVERSE) pre->core.flag |= BAM_FMREVERSE;
				else pre->core.flag &= ~BAM_FMREVERSE;
				if (cur->core.flag & BAM_FUNMAP) { pre->core.flag |= BAM_FMUNMAP; pre->core.flag &= ~BAM_FPROPER_PAIR; }
				if (pre->core.flag & BAM_FUNMAP) { cur->core.flag |= BAM_FMUNMAP; cur->core.flag &= ~BAM_FPROPER_PAIR; }
				bam_template_cigar(pre, cur, &str);
				bam_write1(out, pre);
				bam_write1(out, cur);
				has_prev = 0;
			} else { // unpaired or singleton
				pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
				if (pre->core.flag & BAM_FPAIRED) {
					pre->core.flag |= BAM_FMUNMAP;
					pre->core.flag &= ~BAM_FMREVERSE & ~BAM_FPROPER_PAIR;
				}
				bam_write1(out, pre);
			}
		} else has_prev = 1;
		curr = 1 - curr;
		pre_end = cur_end;
	}
	if (has_prev) bam_write1(out, b[1-curr]);
	bam_header_destroy(header);
	bam_destroy1(b[0]);
	bam_destroy1(b[1]);
	free(str.s);
}
Beispiel #4
0
void samclose(samfile_t *fp)
{
    if (fp == 0) return;
    if (fp->header) bam_header_destroy(fp->header);
    if (fp->type & 1) bam_close(fp->x.bam);
    else if (fp->type == 2) sam_close(fp->x.tamr);
    free(fp);
}
Beispiel #5
0
samfile_t* samopen_out(quip_writer_t writer, void* writer_data, bool binary, void* aux)
{
    samfile_t *fp;
    fp = (samfile_t*)calloc(1, sizeof(samfile_t));

    fp->header = bam_header_dup((const bam_header_t*)aux);

    if (binary) { // binary
        fp->type |= TYPE_BAM;
        fp->x.bam = bam_open_out(writer, writer_data);
        if (fp->x.bam == 0) goto open_err_ret;
        bam_header_write(fp->x.bam, fp->header);

    } else { // text
        // open file

        fp->x.tamw.writer      = writer;
        fp->x.tamw.writer_data = writer_data;
        fp->type |= BAM_OFDEC<<2;

        // write header
        if (aux) {
            int i;
            bam_header_t *alt;
            // parse the header text
            alt = bam_header_init();
            alt->l_text = fp->header->l_text;
            alt->text = fp->header->text;
            sam_header_parse(alt);
            alt->l_text = 0;
            alt->text = 0;
            // check if there are @SQ lines in the header
            writer(writer_data, (uint8_t*) fp->header->text, fp->header->l_text);

            if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
                if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
                    fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
            } else { // then dump ->target_{name,len}
                char target_len_str[11];
                for (i = 0; i < fp->header->n_targets; ++i) {
                    writer(writer_data, (uint8_t*) "@SQ\tSN:", 7);
                    writer(writer_data, (uint8_t*) fp->header->target_name[i], strlen(fp->header->target_name[i]));
                    writer(writer_data, (uint8_t*) "\tLN:", 4);
                    snprintf(target_len_str, sizeof(target_len_str), "%"PRIu32, fp->header->target_len[i]);
                    writer(writer_data, (uint8_t*) target_len_str, strlen(target_len_str));
                    writer(writer_data, (uint8_t*) "\n", 1);
                }
            }
            bam_header_destroy(alt);
        }
    }

    return fp;

open_err_ret:
    free(fp);
    return 0;
}
void samclose(samfile_t *fp)
{
	if (fp == 0) return;
	if (fp->header) bam_header_destroy(fp->header);
	if (fp->type & TYPE_BAM) bam_close(fp->x.bam);
	else if (fp->type & TYPE_READ) sam_close(fp->x.tamr);
	else fclose(fp->x.tamw);
	free(fp);
}
Beispiel #7
0
/* check match between reference and bam files. prints an error
 * message and return non-zero on mismatch 
*/
int checkref(char *fasta_file, char *bam_file)
{
     int i = -1;
     bam_header_t *header;
     faidx_t *fai;
     char *ref;
     int ref_len = -1;
     bamFile bam_fp;
     
     if (! file_exists(fasta_file)) {
          LOG_FATAL("Fsata file %s does not exist. Exiting...\n", fasta_file);
          return 1;
     }     

     if (0 != strcmp(bam_file, "-")  && ! file_exists(bam_file)) {
          LOG_FATAL("BAM file %s does not exist. Exiting...\n", bam_file);
          return 1;
     }     

     bam_fp = strcmp(bam_file, "-") == 0 ? bam_dopen(fileno(stdin), "r") : bam_open(bam_file, "r");
     header = bam_header_read(bam_fp);
     if (!header) {
          LOG_FATAL("Failed to read BAM header from %s\n", bam_file);
          return 1;
     }
     
     fai = fai_load(fasta_file);
     if (!fai) {
          LOG_FATAL("Failed to fasta index for %s\n", fasta_file);
          return 1;
     }
     
     for (i=0; i < header->n_targets; i++) {
          LOG_DEBUG("BAM header target %d of %d: name=%s len=%d\n", 
                    i+1, header->n_targets, header->target_name[i], header->target_len[i]);
          
          ref = faidx_fetch_seq(fai, header->target_name[i], 
                                0, 0x7fffffff, &ref_len);
          if (NULL == ref) {
               LOG_FATAL("Failed to fetch sequence %s from fasta file\n", header->target_name[i]);
               return -1;
          }
          if (header->target_len[i] != ref_len) {
               LOG_FATAL("Sequence length mismatch for sequence %s (%dbp in fasta; %dbp in bam)\n", 
                         header->target_name[i], header->target_len[i], ref_len);
               return -1;
          }
          free(ref);
     }
     
     fai_destroy(fai);
     bam_header_destroy(header);
     bam_close(bam_fp);

     return 0;
}
Beispiel #8
0
void TGM_BamHeaderFree(TGM_BamHeader* pBamHeader)
{
    if (pBamHeader != NULL)
    {
        free(pBamHeader->pMD5s);
        bam_header_destroy(pBamHeader->pOrigHeader);

        free(pBamHeader);
    }
}
Beispiel #9
0
int bam_pad2unpad(bamFile in, bamFile out)
{
	bam_header_t *h;
	bam1_t *b;
	kstring_t r, q;
	uint32_t *cigar2 = 0;
	int n2 = 0, m2 = 0, *posmap = 0;

	h = bam_header_read(in);
	bam_header_write(out, h);
	b = bam_init1();
	r.l = r.m = q.l = q.m = 0; r.s = q.s = 0;
	while (bam_read1(in, b) >= 0) {
		uint32_t *cigar = bam1_cigar(b);
		n2 = 0;
		if (b->core.pos == 0 && b->core.tid >= 0 && strcmp(bam1_qname(b), h->target_name[b->core.tid]) == 0) {
			int i, k;
			unpad_seq(b, &r);
			write_cigar(cigar2, n2, m2, bam_cigar_gen(b->core.l_qseq, BAM_CMATCH));
			replace_cigar(b, n2, cigar2);
			posmap = realloc(posmap, r.m * sizeof(int));
			for (i = k = 0; i < r.l; ++i) {
				posmap[i] = k; // note that a read should NOT start at a padding
				if (r.s[i]) ++k;
			}
		} else {
			int i, k, op;
			unpad_seq(b, &q);
			if (bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[0]);
			for (i = 0, k = b->core.pos; i < q.l; ++i, ++k)
				q.s[i] = q.s[i]? (r.s[k]? BAM_CMATCH : BAM_CINS) : (r.s[k]? BAM_CDEL : BAM_CPAD);
			for (i = k = 1, op = q.s[0]; i < q.l; ++i) {
				if (op != q.s[i]) {
					write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
					op = q.s[i]; k = 1;
				} else ++k;
			}
			write_cigar(cigar2, n2, m2, bam_cigar_gen(k, op));
			if (bam_cigar_op(cigar[b->core.n_cigar-1]) == BAM_CSOFT_CLIP) write_cigar(cigar2, n2, m2, cigar[b->core.n_cigar-1]);
			for (i = 2; i < n2; ++i)
				if (bam_cigar_op(cigar2[i]) == BAM_CMATCH && bam_cigar_op(cigar2[i-1]) == BAM_CPAD && bam_cigar_op(cigar2[i-2]) == BAM_CMATCH)
					cigar2[i] += cigar2[i-2], cigar2[i-2] = cigar2[i-1] = 0;
			for (i = k = 0; i < n2; ++i)
				if (cigar2[i]) cigar2[k++] = cigar2[i];
			n2 = k;
			replace_cigar(b, n2, cigar2);
			b->core.pos = posmap[b->core.pos];
		}
		bam_write1(out, b);
	}
	free(r.s); free(q.s); free(posmap);
	bam_destroy1(b);
	bam_header_destroy(h);
	return 0;
}
Beispiel #10
0
bwa_seqio_t *bwa_bam_open(const char *fn, int which) {
	bwa_seqio_t *bs;
	bam_header_t *h;
	bs = (bwa_seqio_t*) calloc(1, sizeof(bwa_seqio_t));
	bs->is_bam = 1;
	bs->which = which;
	bs->fp = bam_open(fn, "r");
	h = bam_header_read(bs->fp);
	bam_header_destroy(h);
	return bs;
}
Beispiel #11
0
int main(int argc, char** argv)
{
    if(argc < 3) {
        printf("No input nor output files provided");
        return -1;
    }

    bamFile in = bam_open(argv[1], "r");
    bam_header_t* header;
    if (in == NULL) {
        printf("opening input file failed");
        return -1;
    }

    bam1_t* b = bam_init1();

    bamFile out = bam_open(argv[2], "w");
    if (out == NULL) {
        printf("opening input file failed");
        return -1;
    }

    header = bam_header_read(in);
    if(bam_header_write(out, header) < 0) {
        printf("writing header failed");
    }

    long nextPrunedId;
    if(!scanf ("%lu", &nextPrunedId)) {
        printf("warning: no ids provided");
        return -1;
    }
    long id = 0;
    while (bam_read1(in, b) >= 0) {
        // write BAM back
        if (nextPrunedId != id++) {
            bam_write1(out, b);
        } else {
            // fprintf(stderr, "pruning: id: %lu, pos: %d, length: %d\n", nextPrunedId, b->core.pos, b->core.l_qseq);
            if(!scanf ("%lu", &nextPrunedId)) {
                break;
            }
        }
    }

    // closing all resources
    bam_header_destroy(header);
    bam_close(in);
    bam_close(out);
    bam_destroy1(b);
    return 0;
}
Beispiel #12
0
bwa_seqio_t *bwa_bam_open(const char *fn, int which)
{
	bwa_seqio_t *bs;
	bam_header_t *h;
	bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
	bs->is_bam = 1;
	bs->which = which;
	bs->fp = bam_open(fn, "r");
	if (0 == bs->fp) err_fatal_simple("Couldn't open bam file");
	h = bam_header_read(bs->fp);
	bam_header_destroy(h);
	return bs;
}
Beispiel #13
0
bwa_seqio_t *bwa_bam_open(const char *fn, int which, char **saif,
                          gap_opt_t *o0, bam_header_t **hh)
{
    int c, b=0;
    bwa_seqio_t *bs;
    bam_header_t *h;
    bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
    bs->is_bam = 1;
    bs->which = which;
    bs->fp = (fn[0]!='-' || fn[1]) ? bam_open(fn, "r") : bam_dopen(0, "r") ;
    h = bam_header_read(bs->fp);
    if(hh) *hh=h;
    else bam_header_destroy(h);

    if( saif ) for(c=0; c!=3; ++c)
        {
            gap_opt_t opt;
            if( saif[c] ) {
                bs->sai[c] = xopen(saif[c], "r");
                if( 1 > fread(&opt, sizeof(gap_opt_t), 1, bs->sai[c]) )
                {
                    fclose(bs->sai[c]);
                    bs->sai[c] = 0;
                }
                opt.n_threads=o0->n_threads;
                if(o0) {
                    if(b) {
                        opt.mode=o0->mode;
                        if( memcmp(o0, &opt, sizeof(gap_opt_t)) ) {
                            fprintf( stderr, "[bwa_bam_open] options from sai file \"%s\" conflict with others.\n", saif[c] ) ;
                            exit(1);
                        }
                        fprintf( stderr, "[bwa_bam_open] options from sai file \"%s\" match.\n", saif[c] ) ;
                    }
                    else {
                        fprintf( stderr, "[bwa_bam_open] recovered options from sai file \"%s\".\n", saif[c] ) ;
                        memcpy(o0, &opt, sizeof(gap_opt_t));
                        b=1;
                    }
                }
            }
        }
    return bs;
}
Beispiel #14
0
void convert_sam_to_bam(char* sam_input, char* bam_input) {
    bam1_t* bam_p = bam_init1();

    LOG_DEBUG("CONVERT-START: sam to bam\n");

    //open SAM file for read
    if (time_flag) {
        start_timer(t1_convert);
    }
    tamFile sam_fd = sam_open(sam_input);

    //open BAM file for write
    bam_file_t* bam_file_p =  bam_fopen_mode(bam_input, NULL, "w");

    //read header from SAM file
    bam_header_t* bam_header_p = sam_header_read(sam_fd);

    //write header to BAM file
    bam_header_write(bam_file_p->bam_fd, bam_header_p);

    //write alignments to BAM file
    while (sam_read1(sam_fd, bam_header_p, bam_p) > 0) {
        bam_write1(bam_file_p->bam_fd, bam_p);
        num_alignments++;
    }

    //close BAM and SAM files, free bam alignment and bam file object
    bam_fclose(bam_file_p);
    sam_close(sam_fd);
    bam_header_destroy(bam_header_p);
    bam_destroy1(bam_p);
    if (time_flag) {
        stop_timer(t1_convert, t2_convert, convert_time);
    }

    //number_of_batchs = 1, convention value for statistics (not real batch)
    number_of_batchs = 1;
}
Beispiel #15
0
bwa_seqio_t *bwa_bam_open(const char *fn, int which)
{
	bwa_seqio_t *bs;
#ifndef USE_HTSLIB
	bam_header_t *h;
#endif
	bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
	bs->is_bam = 1;
	bs->which = which;
#ifdef USE_HTSLIB
	bs->fp = sam_open(fn, "rb");
#else
	bs->fp = bam_open(fn, "r");
#endif
	if (0 == bs->fp) err_fatal_simple("Couldn't open bam file");
#ifdef USE_HTSLIB
	bs->h = sam_hdr_read(bs->fp);
#else
	h = bam_header_read(bs->fp);
	bam_header_destroy(h);
#endif
	return bs;
}
Beispiel #16
0
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
{
    BGZF *fp;
    uint8_t *buf;
    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
    const int es=BGZF_EMPTY_BLOCK_SIZE;
    int i;

    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");
    if (fp == 0) {
        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);
        return 1;
    }
    if (h) bam_header_write(fp, h);

    buf = (uint8_t*) malloc(BUF_SIZE);
    for(i = 0; i < nfn; ++i){
        BGZF *in;
        bam_header_t *old;
        int len,j;

        in = strcmp(fn[i], "-")? bgzf_open(fn[i], "r") : bgzf_fdopen(fileno(stdin), "r");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);
            return -1;
        }
        if (in->is_write) return -1;

        old = bam_header_read(in);
        if (h == 0 && i == 0) bam_header_write(fp, old);

        if (in->block_offset < in->block_length) {
            bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
            bgzf_flush(fp);
        }

        j=0;
        while ((len = bgzf_raw_read(in, buf, BUF_SIZE)) > 0) {
            if(len<es){
                int diff=es-len;
                if(j==0) {
                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);
                    return -1;
                }
                bgzf_raw_write(fp, ebuf, len);
                memcpy(ebuf,ebuf+len,diff);
                memcpy(ebuf+diff,buf,len);
            } else {
                if(j!=0) bgzf_raw_write(fp, ebuf, es);
                len-= es;
                memcpy(ebuf,buf+len,es);
                bgzf_raw_write(fp, buf, len);
            }
            j=1;
        }

        /* check final gzip block */
        {
            const uint8_t gzip1=ebuf[0];
            const uint8_t gzip2=ebuf[1];
            const uint32_t isize=*((uint32_t*)(ebuf+es-4));
            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);
                fprintf(stderr, " Possible output corruption.\n");
                bgzf_raw_write(fp, ebuf, es);
            }
        }
        bam_header_destroy(old);
        bgzf_close(in);
    }
    free(buf);
    bgzf_close(fp);
    return 0;
}
Beispiel #17
0
int main_depth(int argc, char *argv[])
#endif
{
	int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0;
	const bam_pileup1_t **plp;
	char *reg = 0; // specified region
	void *bed = 0; // BED data structure
	bam_header_t *h = 0; // BAM header of the 1st input
	aux_t **data;
	bam_mplp_t mplp;

	// parse the command line
	while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) {
		switch (n) {
			case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
			case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
			case 'q': baseQ = atoi(optarg); break;   // base quality threshold
			case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
		}
	}
	if (optind == argc) {
		fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n");
		return 1;
	}

	// initialize the auxiliary data structures
	n = argc - optind; // the number of BAMs on the command line
	data = (aux_t **) calloc(n, sizeof(void*)); // data[i] for the i-th input
	beg = 0; end = 1<<30; tid = -1;  // set the default region
	for (i = 0; i < n; ++i) {
		bam_header_t *htmp;
		data[i] = (aux_t *) calloc(1, sizeof(aux_t));
		data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
		data[i]->min_mapQ = mapQ;                    // set the mapQ filter
		htmp = bam_header_read(data[i]->fp);         // read the BAM header
		if (i == 0) {
			h = htmp; // keep the header of the 1st BAM
			if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region
		} else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
		if (tid >= 0) { // if a region is specified and parsed successfully
			bam_index_t *idx = bam_index_load(argv[optind+i]);  // load the index
			data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
			bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
		}
	}

	// the core multi-pileup loop
	mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
	n_plp = (int*) calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
	plp = (bam_pileup1_t **) calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
	while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
		if (pos < beg || pos >= end) continue; // out of range; skip
		if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
		fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
		for (i = 0; i < n; ++i) { // base level filters have to go here
			int j, m = 0;
			for (j = 0; j < n_plp[i]; ++j) {
				const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
				if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
				else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
			}
			printf("\t%d", n_plp[i] - m); // this the depth to output
		}
		putchar('\n');
	}
	free(n_plp); free(plp);
	bam_mplp_destroy(mplp);

	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(reg);
	if (bed) bed_destroy(bed);
	return 0;
}
Beispiel #18
0
int main_depth(int argc, char *argv[])
#endif
{
	int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles;
	const bam_pileup1_t **plp;
	char *reg = 0; // specified region
	void *bed = 0; // BED data structure
    char *file_list = NULL, **fn = NULL;
	bam_header_t *h = 0; // BAM header of the 1st input
	aux_t **data;
	bam_mplp_t mplp;

	// parse the command line
	while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) {
		switch (n) {
			case 'l': min_len = atoi(optarg); break; // minimum query length
			case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
			case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
			case 'q': baseQ = atoi(optarg); break;   // base quality threshold
			case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
			case 'f': file_list = optarg; break;
		}
	}
	if (optind == argc && !file_list) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n");
        fprintf(stderr, "Options:\n");
        fprintf(stderr, "   -b <bed>            list of positions or regions\n");
        fprintf(stderr, "   -f <list>           list of input BAM filenames, one per line [null]\n");
        fprintf(stderr, "   -l <int>            minQLen\n");
        fprintf(stderr, "   -q <int>            base quality threshold\n");
        fprintf(stderr, "   -Q <int>            mapping quality threshold\n");
        fprintf(stderr, "   -r <chr:from-to>    region\n");
        fprintf(stderr, "\n");
		return 1;
	}

	// initialize the auxiliary data structures
    if (file_list) 
    {
        if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
        n = nfiles;
        argv = fn;
        optind = 0;
    }
    else
        n = argc - optind; // the number of BAMs on the command line
	data = calloc(n, sizeof(void*)); // data[i] for the i-th input
	beg = 0; end = 1<<30; tid = -1;  // set the default region
	for (i = 0; i < n; ++i) {
		bam_header_t *htmp;
		data[i] = calloc(1, sizeof(aux_t));
		data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
		data[i]->min_mapQ = mapQ;                    // set the mapQ filter
		data[i]->min_len  = min_len;                 // set the qlen filter
		htmp = bam_header_read(data[i]->fp);         // read the BAM header
		if (i == 0) {
			h = htmp; // keep the header of the 1st BAM
			if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region
		} else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
		if (tid >= 0) { // if a region is specified and parsed successfully
			bam_index_t *idx = bam_index_load(argv[optind+i]);  // load the index
			data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
			bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
		}
	}

	// the core multi-pileup loop
	mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
	bam_mplp_set_maxcnt(mplp,2147483647); // set max_depth to int max
	n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
	plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
	while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
		if (pos < beg || pos >= end) continue; // out of range; skip
		if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
		fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
		for (i = 0; i < n; ++i) { // base level filters have to go here
			int j, m = 0;
			for (j = 0; j < n_plp[i]; ++j) {
				const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
				if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
				else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
			}
			printf("\t%d", n_plp[i] - m); // this the depth to output
		}
		putchar('\n');
	}
	free(n_plp); free(plp);
	bam_mplp_destroy(mplp);

	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(reg);
	if (bed) bed_destroy(bed);
    if ( file_list )
    {
        for (i=0; i<n; i++) free(fn[i]);
        free(fn);
    }
	return 0;
}
Beispiel #19
0
int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)
{
    BGZF *fp;
    FILE* fp_file;
    uint8_t *buf;
    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];
    const int es=BGZF_EMPTY_BLOCK_SIZE;
    int i;
    
    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(_fileno(stdout), "w");
    if (fp == 0) {
        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __FUNCTION__, outbam);
        return 1;
    }
    if (h) bam_header_write(fp, h);
    
    buf = (uint8_t*) malloc(BUF_SIZE);
    for(i = 0; i < nfn; ++i){
        BGZF *in;
        bam_header_t *old;
        int len,j;
        
        in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(_fileno(stdin), "r");
        if (in == 0) {
            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __FUNCTION__, fn[i]);
            return -1;
        }
        if (in->open_mode != 'r') return -1;
        
        old = bam_header_read(in);
        if (h == 0 && i == 0) bam_header_write(fp, old);
        
        if (in->block_offset < in->block_length) {
            bgzf_write(fp, (uint8_t*)in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);
            bgzf_flush(fp);
        }
        
        j=0;
#ifdef _USE_KNETFILE
        fp_file=fp->x.fpw;
        while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) {
#else  
        fp_file=fp->file;
        while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {
#endif
            if(len<es){
                int diff=es-len;
                if(j==0) {
                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __FUNCTION__, fn[i]);
                    return -1;
                }
                fwrite(ebuf, 1, len, fp_file);
                memcpy(ebuf,ebuf+len,diff);
                memcpy(ebuf+diff,buf,len);
            } else {
                if(j!=0) fwrite(ebuf, 1, es, fp_file);
                len-= es;
                memcpy(ebuf,buf+len,es);
                fwrite(buf, 1, len, fp_file);
            }
            j=1;
        }

        /* check final gzip block */
        {
            const uint8_t gzip1=ebuf[0];
            const uint8_t gzip2=ebuf[1];
            const uint32_t isize=*((uint32_t*)(ebuf+es-4));
            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {
                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __FUNCTION__, fn[i]);
                fprintf(stderr, " Possible output corruption.\n");
                fwrite(ebuf, 1, es, fp_file);
            }
        }
        bam_header_destroy(old);
        bgzf_close(in);
    }
    free(buf);
    bgzf_close(fp);
    return 0;
}



int main_cat(int argc, char *argv[])
{
    bam_header_t *h = 0;
    char *outfn = 0;
    int c, ret;
    while ((c = getopt(argc, argv, "h:o:")) >= 0) {
        switch (c) {
            case 'h': {
                tamFile fph = sam_open(optarg);
                if (fph == 0) {
                    fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __FUNCTION__, argv[1]);
                    return 1;
                }
                h = sam_header_read(fph);
                sam_close(fph);
                break;
            }
            case 'o': outfn = strdup(optarg); break;
        }
    }
    if (argc - optind < 2) {
        fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");
        return 1;
    }
    ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");
    free(outfn);
    return ret;
}
Beispiel #20
0
int
tmap_seqs_io_sff2sam_main(int argc, char *argv[])
{
  int c, help = 0;
  tmap_seqs_io_t *io_in = NULL;
  tmap_seqs_t *seqs = NULL;
  char **sam_rg = NULL;
  int32_t sam_rg_num = 0;
  int bidirectional = 0, sam_flowspace_tags = 0;
  int out_type = 0;
  tmap_sam_io_t *io_out = NULL;
  bam_header_t *header = NULL; // BAM Header
  int32_t i;

  /*
  uint8_t *key_seq = NULL;
  int key_seq_len = 0;
  */

  while((c = getopt(argc, argv, "DGR:Yvh")) >= 0) {
      switch(c) {
        case 'D': bidirectional = 1; break;
        case 'G': break;
        case 'R':
                  sam_rg = tmap_realloc(sam_rg, (1+sam_rg_num) * sizeof(char*), "sam_rg");
                  sam_rg[sam_rg_num] = tmap_strdup(optarg);
                  sam_rg_num++;
                  break;
        case 'Y': sam_flowspace_tags = 1; break;
        case 'v': tmap_progress_set_verbosity(1); break;
        case 'h': help = 1; break;
        default: return 1;
      }
  }
  if(1 != argc - optind || 1 == help) {
      tmap_file_fprintf(tmap_file_stderr, "Usage: %s %s [-R -Y -v -h] <in.sff>\n", PACKAGE, argv[0]);
      return 1; 
  }

  // input
  io_in = tmap_seqs_io_init(&argv[optind], 1, TMAP_SEQ_TYPE_SFF, TMAP_FILE_NO_COMPRESSION, 0l, 0l);

  // BAM Header
  header = tmap_seqs_io_to_bam_header(NULL, io_in, sam_rg, sam_rg_num, argc, argv);

  // open the output file
  switch(out_type) {
    case 0: // SAM
      io_out = tmap_sam_io_init2("-", "wh", header);
      break;
    case 1:
      io_out = tmap_sam_io_init2("-", "wb", header);
      break;
    case 2:
      io_out = tmap_sam_io_init2("-", "wbu", header);
      break;
    default:
      tmap_bug();
  }

  // destroy the BAM Header
  bam_header_destroy(header);
  header = NULL;

  seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF);
  while(0 < tmap_seqs_io_read(io_in, seqs, io_out->fp->header->header)) {
      bam1_t *b = NULL;
      tmap_seq_t *seq = seqs->seqs[0];
      b = tmap_sam_convert_unmapped(seq, sam_flowspace_tags, bidirectional, NULL,
                                    0, 0, 0,
                                    0, 0, 0,
                                    "\tlq:i:%d\trq:i:%d\tla:i:%d\trq:i:%d",
                                    seq->data.sff->rheader->clip_qual_left,
                                    seq->data.sff->rheader->clip_qual_right,
                                    seq->data.sff->rheader->clip_adapter_left,
                                    seq->data.sff->rheader->clip_adapter_right);
      if(samwrite(io_out->fp, b) <= 0) {
          tmap_error("Error writing the SAM file", Exit, WriteFileError);
      }
      bam_destroy1(b); 
      tmap_seqs_destroy(seqs);
      seqs = tmap_seqs_init(TMAP_SEQ_TYPE_SFF);
  }
  tmap_seqs_destroy(seqs);

  // free memory
  tmap_seqs_io_destroy(io_in);
  tmap_sam_io_destroy(io_out);
  for(i=0;i<sam_rg_num;i++) {
      free(sam_rg[i]);
  }
  free(sam_rg);

  return 0;
}
Beispiel #21
0
int main(int argc, char *argv[]) {
    int c;
    const char* normal_sample_id = _default_normal_sample_id;
    const char* tumor_sample_id = _default_tumor_sample_id;
    const char *fn_fa = 0;
    pu_data2_t *d = (pu_data2_t*)calloc(1, sizeof(pu_data2_t));
    d->min_somatic_qual=15;
    d->tid = -1; d->mask = BAM_DEF_MASK; d->mapQ = 0;
    d->c = sniper_maqcns_init();
    int use_priors = 1;
    d->include_loh = 1;
    d->include_gor = 1;
    d->use_joint_priors = 0;
    d->somatic_mutation_rate = 0.01;
    const char *output_format = "classic";

    while ((c = getopt(argc, argv, "n:t:vf:T:N:r:I:q:Q:pLGJs:F:")) >= 0) {
        switch (c) {
            case 'f': fn_fa = optarg; break;
            case 'T': d->c->theta = atof(optarg); break;
            case 'N': d->c->n_hap = atoi(optarg); break;
            case 'r': d->c->het_rate = atof(optarg); break;
            case 'q': d->mapQ = atoi(optarg); break;
            case 'Q': d->min_somatic_qual = atoi(optarg); break;
            case 'F': output_format = optarg; break;
            case 'p': use_priors = 0; break;
            case 'J': d->use_joint_priors = 1; break;
            case 's': d->somatic_mutation_rate = atof(optarg); d->use_joint_priors = 1; break;
            case 'v': version_info(); exit(0); break;
            case 't': tumor_sample_id = optarg; break;
            case 'n': normal_sample_id = optarg; break;
            case 'L': d->include_loh = 0; break;
            case 'G': d->include_gor = 0; break;
            default: fprintf(stderr, "Unrecognizd option '-%c'.\n", c); return 1;
        }
    }

    if (optind == argc) {
        usage(argv[0], d);
        sniper_maqcns_destroy(d->c); free(d);
        return 1;
    }
    if (fn_fa) {
        d->fai = fai_load(fn_fa);
    }
    else {
        fprintf(stderr, "You MUST specify a reference sequence. It isn't optional.\n");
        sniper_maqcns_destroy(d->c);
        free(d);
        exit(1);
    }
    if(d->use_joint_priors) {
        fprintf(stderr,"Using priors accounting for somatic mutation rate. Prior probability of a somatic mutation is %f\n",d->somatic_mutation_rate);
        make_joint_prior(d->somatic_mutation_rate);
    }

    sniper_maqcns_prepare(d->c);
    fprintf(stderr,"Preparing to snipe some somatics\n");
    if(use_priors) {
        fprintf(stderr,"Using prior probabilities\n");
        makeSoloPrior();
    }
    bamFile fp1, fp2;
    qAddTableInit();
    fp1 = (strcmp(argv[optind], "-") == 0)? bam_dopen(fileno(stdin), "r") : bam_open(argv[optind], "r");
    fprintf(stderr, "Normal bam is %s\n", argv[optind+1]);
    fprintf(stderr, "Tumor bam is %s\n", argv[optind]);
    d->h1 = bam_header_read(fp1);
    sam_header_parse_rg(d->h1);
    fp2 = bam_open(argv[optind+1], "r");
    d->h2 = bam_header_read(fp2);
    sam_header_parse_rg(d->h2);
    FILE* snp_fh = fopen(argv[optind+2], "w");
    /* this will exit if the format name is invalid */
    output_formatter_t fmt = output_formatter_create(output_format, snp_fh);
    d->output_formatter = &fmt;
    if(snp_fh) {
        header_data_t hdr;
        hdr.refseq = fn_fa;
        hdr.normal_sample_id = normal_sample_id;
        hdr.tumor_sample_id = tumor_sample_id;
        d->output_formatter->header_fn(snp_fh, &hdr);
        bam_sspileup_file(fp1, fp2, d->mask, d->mapQ, glf_somatic, d, snp_fh);
    }
    else {
        fprintf(stderr, "Unable to open snp file!!!!!!!!!\n");
        exit(1);
    }

    bam_close(fp1);
    bam_close(fp2);
    bam_header_destroy(d->h1);
    bam_header_destroy(d->h2);
    if (d->fai) fai_destroy(d->fai);
    sniper_maqcns_destroy(d->c);
    free(d->ref); free(d);
    fclose(snp_fh);
    return 0;
}
Beispiel #22
0
/*!
  @abstract Sort an unsorted BAM file based on the chromosome order
  and the leftmost position of an alignment

  @param  is_by_qname whether to sort by query name
  @param  fn       name of the file to be sorted
  @param  prefix   prefix of the output and the temporary files; upon
	                   sucessess, prefix.bam will be written.
  @param  max_mem  approxiate maximum memory (very inaccurate)

  @discussion It may create multiple temporary subalignment files
  and then merge them by calling bam_merge_core(). This function is
  NOT thread safe.
 */
void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t max_mem, int is_stdout)
{
	int n, ret, k, i;
	size_t mem;
	bam_header_t *header;
	bamFile fp;
	bam1_t *b, **buf;

	g_is_by_qname = is_by_qname;
	n = k = 0; mem = 0;
	fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
		return;
	}
	header = bam_header_read(fp);
    if (is_by_qname) change_SO(header, "queryname");
    else change_SO(header, "coordinate");
	buf = (bam1_t**)calloc(max_mem / BAM_CORE_SIZE, sizeof(bam1_t*));
	// write sub files
	for (;;) {
		if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
		b = buf[k];
		if ((ret = bam_read1(fp, b)) < 0) break;
		mem += ret;
		++k;
		if (mem >= max_mem) {
			sort_blocks(n++, k, buf, prefix, header, 0);
			mem = 0; k = 0;
		}
	}
	if (ret != -1)
		fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
	if (n == 0) sort_blocks(-1, k, buf, prefix, header, is_stdout);
	else { // then merge
		char **fns, *fnout;
		fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n+1);
		sort_blocks(n++, k, buf, prefix, header, 0);
		fnout = (char*)calloc(strlen(prefix) + 20, 1);
		if (is_stdout) sprintf(fnout, "-");
		else sprintf(fnout, "%s.bam", prefix);
		fns = (char**)calloc(n, sizeof(char*));
		for (i = 0; i < n; ++i) {
			fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
			sprintf(fns[i], "%s.%.4d.bam", prefix, i);
		}
		bam_merge_core(is_by_qname, fnout, 0, n, fns, 0, 0);
		free(fnout);
		for (i = 0; i < n; ++i) {
			unlink(fns[i]);
			free(fns[i]);
		}
		free(fns);
	}
        for (k = 0; (size_t)k < max_mem / BAM_CORE_SIZE; ++k) {
		if (buf[k]) {
			free(buf[k]->data);
			free(buf[k]);
		}
	}
	free(buf);
	bam_header_destroy(header);
	bam_close(fp);
}
Beispiel #23
0
samfile_t *samopen(const char *fn, const char *mode, const void *aux)
{
	samfile_t *fp;
	fp = (samfile_t*)calloc(1, sizeof(samfile_t));
	if (mode[0] == 'r') { // read
		fp->type |= TYPE_READ;
		if (mode[1] == 'b') { // binary
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
			if (fp->x.bam == 0) goto open_err_ret;
			fp->header = bam_header_read(fp->x.bam);
		} else { // text
			fp->x.tamr = sam_open(fn);
			if (fp->x.tamr == 0) goto open_err_ret;
			fp->header = sam_header_read(fp->x.tamr);
			if (fp->header->n_targets == 0) { // no @SQ fields
				if (aux) { // check if aux is present
					bam_header_destroy(fp->header);
					fp->header = sam_header_read2((const char*)aux);
				}
				if (fp->header->n_targets == 0)
					fprintf(stderr, "[samopen] empty header.\n");
			} else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
		}
	} else if (mode[0] == 'w') { // write
		fp->header = bam_header_dup((const bam_header_t*)aux);
		if (mode[1] == 'b') { // binary
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, "w") : bam_dopen(fileno(stdout), "w");
			if (fp->x.bam == 0) goto open_err_ret;
			bam_header_write(fp->x.bam, fp->header);
		} else { // text
			// open file
			fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
			if (fp->x.tamr == 0) goto open_err_ret;
			// write header
			if (strstr(mode, "h")) {
				int i;
				bam_header_t *alt;
				// parse the header text 
				alt = bam_header_init();
				alt->l_text = fp->header->l_text; alt->text = fp->header->text;
				sam_header_parse(alt);
				alt->l_text = 0; alt->text = 0;
				// check if there are @SQ lines in the header
				if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
					if (alt->n_targets != fp->header->n_targets)
						fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
					fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
				} else { // then dump ->target_{name,len}
					for (i = 0; i < fp->header->n_targets; ++i)
						fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
				}
				bam_header_destroy(alt);
			}
		}
	}
	return fp;

open_err_ret:
	free(fp);
	return 0;
}
Beispiel #24
0
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int*));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);


	int storeSize = 100;

	int delStore[2][100] = {{0},{0}};

	typedef char * mstring;

	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					//MDW start					
					//for each position in the pileup column
					int charLen = 16;
					int countChars[ charLen ][2];
					int countiChars[ charLen ][2];

					int countGap[2]={0,0};

					//double qvTotal=0;
					int numStruck=0;
					int numGood=0;
					int tti;
					int ttj;
					mstring insAllele[100];
					int insAlleleCnt[100];
					int sf=0;
					int flag=0;

					//typedef char * string;
					char insStr0[10000];
					int iCnt0=0;

					char insStr1[10000];
					int iCnt1=0;

					char delStr0[10000];
					int dCnt0=0;

					char delStr1[10000];
					int dCnt1=0;


					float qposP[10000];
					int qposCnt=0;



					//initialize with zeros
						for(tti=0;tti<charLen;tti++){
						  countChars[tti][0]=0;
						  countChars[tti][1]=0;
						}

					// define repeat length here; look back up to 10 prior positions
					// start one position away.
					int replC=0; //
					for(tti=1;tti<=15;tti++){
						// check for greater than zero
						if(toupper(ref[pos-1])==toupper(ref[pos-tti])){
							replC++;
						}else{ // breaks the chain at first non identical to current position not strict homopolymer
							break;
						}
					}					
					int reprC=0; // 
					for(tti=1;tti<=15;tti++){
						// check for greater than zero
						if(toupper(ref[pos+1])==toupper(ref[pos+tti])){
							reprC++;
						}else{ // breaks the chain at first non identical to current position not strict homopolymer
							break;
						}
					}		
					int repT = replC;
					if(replC < reprC){
						repT=reprC;
					}



					for (j = 0; j < n_plp[i]; ++j){
						const bam_pileup1_t *p = plp[i] + j;
									
						/*
						SAME LOGIC AS pileup_seq()
						*/

						if(p->is_refskip){ // never count intron gaps in numStruck
							continue;
						}

						if(p->is_del){ // skip deletion gap, after first position which is the first aligned char
							continue;
						}

						if( 	p->b->core.qual < conf->min_mqToCount  || // mapping quality
							conf->maxrepC < (repT) || // max homopolymer run, this will not 
							(!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches
							p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) ||  // trimEnd is 1-based
							p->zf == 1 || // fusion tag
							p->ih > conf->maxIH  || // max hit index
							(p->nmd > conf->maxNM) || // max mismatch
							(conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs
							(conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary
							(conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup
							(conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY))  || // optionally strike secondary or dup
							(conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) ))   // optionally strike secondary, dup and QCfail


						){
							numStruck++;
							continue;
						}

						
						//printf("repT=%d: %d %c %c %c %c \n",repT,p->indel,ref[pos],ref[pos-1],ref[pos-2],ref[pos-3]);


						if(!p->is_del && p->indel==0){
  						  countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++;
						  numGood++;			

						}else if(p->is_refskip){
						  countGap[ bam1_strand(p->b) ]++;
						}
						
						if(p->indel<0){
    						  numGood++;			
						  if(bam1_strand(p->b) ==0){
							  for(tti=1;tti<= -p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   delStr0[dCnt0] =  ref[pos+tti];
							   dCnt0++;
							  }	
							  delStr0[dCnt0] = ',';
							  dCnt0++;
						  }else{
							  for(tti=1;tti<= -p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   delStr1[dCnt1] = ref[pos+tti];
							   dCnt1++;
							  }	
							  delStr1[dCnt1] = ',';
							  dCnt1++;
						  }



						}else if(p->indel>0){
						  numGood++;			

						  if(bam1_strand(p->b) ==0){
							  for(tti=1;tti<= p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   insStr0[iCnt0] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
							   iCnt0++;
							  }	
							  insStr0[iCnt0] = ',';
							  iCnt0++;
						  }else{
							  for(tti=1;tti<= p->indel; tti++) {
							    // current spot, starting at 0 in store, because indel<0 refers to next position
							   insStr1[iCnt1] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)];
							   iCnt1++;
							  }	
							  insStr1[iCnt1] = ',';
							  iCnt1++;
						  }


						}
						//calculate position of variant within aligned read - no soft clips
						if( toupper(ref[pos]) != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0  ){

						//distance to end; calculate distance to end of aligned read.  removes soft clips.
						int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd;
						qposP[qposCnt] = distToEnd;						  
						qposCnt++;	
						// printf("id=%s, pos=%d",bam1_qname(p->b),distToEnd);
						}	
					}

					//

					//print A,C,G,T, by +/-
				        printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", 	countChars[1][0],countChars[1][1],
											countChars[2][0],countChars[2][1],
											countChars[4][0],countChars[4][1],
											countChars[8][0],countChars[8][1],
											countChars[7][0],countChars[7][1]);
					
					putchar('\t');
					for(tti=0;tti<dCnt0;tti++){
					  putchar(delStr0[tti]);
					}

					putchar('\t');
					for(tti=0;tti<dCnt1;tti++){
					  putchar(delStr1[tti]);
					}

					putchar('\t');
					for(tti=0;tti<iCnt0;tti++){
					  putchar(insStr0[tti]);
					}

					putchar('\t');
					for(tti=0;tti<iCnt1;tti++){
					  putchar(insStr1[tti]);
					}

					printf("\t%d\t%d",numGood,numStruck);					

					// get non-ref qpos variation

					float medqpos = -1;
					float medAbsDev = -1;
					if(qposCnt>0){
					  medqpos = median(qposCnt,qposP);
					  float absDev[qposCnt];
					  for(tti=0;tti<qposCnt;tti++){
						absDev[tti] = abs(medqpos - qposP[tti]);
					  }
					  medAbsDev = median(qposCnt-1,absDev);
					}
					printf("\t%f",medAbsDev);

					///END MDW
				}



			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
Beispiel #25
0
int bam_merge_core2(int by_qname, const char *out, const char *headers, int n, char * const *fn, int flag, const char *reg, int level)
#endif
{
	bamFile fpout, *fp;
	heap1_t *heap;
	bam_header_t *hout = 0;
	bam_header_t *hheaders = NULL;
	int i, j, *RG_len = 0;
	uint64_t idx = 0;
	char **RG = 0, mode[8];
	bam_iter_t *iter = 0;

	if (headers) {
		tamFile fpheaders = sam_open(headers);
		if (fpheaders == 0) {
			const char *message = strerror(errno);
			fprintf(stderr, "[bam_merge_core] cannot open '%s': %s\n", headers, message);
			return -1;
		}
		hheaders = sam_header_read(fpheaders);
		sam_close(fpheaders);
	}

	g_is_by_qname = by_qname;
	fp = (bamFile*)calloc(n, sizeof(bamFile));
	heap = (heap1_t*)calloc(n, sizeof(heap1_t));
	iter = (bam_iter_t*)calloc(n, sizeof(bam_iter_t));
	// prepare RG tag
	if (flag & MERGE_RG) {
		RG = (char**)calloc(n, sizeof(void*));
		RG_len = (int*)calloc(n, sizeof(int));
		for (i = 0; i != n; ++i) {
			int l = strlen(fn[i]);
			const char *s = fn[i];
			if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4;
			for (j = l - 1; j >= 0; --j) if (s[j] == '/') break;
			++j; l -= j;
			RG[i] = calloc(l + 1, 1);
			RG_len[i] = l;
			strncpy(RG[i], s + j, l);
		}
	}
	// read the first
	for (i = 0; i != n; ++i) {
		bam_header_t *hin;
		fp[i] = bam_open(fn[i], "r");
		if (fp[i] == 0) {
			int j;
			fprintf(stderr, "[bam_merge_core] fail to open file %s\n", fn[i]);
			for (j = 0; j < i; ++j) bam_close(fp[j]);
			free(fp); free(heap);
			// FIXME: possible memory leak
			return -1;
		}
		hin = bam_header_read(fp[i]);
		if (i == 0) { // the first BAM
			hout = hin;
		} else { // validate multiple baf
			int min_n_targets = hout->n_targets;
			if (hin->n_targets < min_n_targets) min_n_targets = hin->n_targets;

			for (j = 0; j < min_n_targets; ++j)
				if (strcmp(hout->target_name[j], hin->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] different target sequence name: '%s' != '%s' in file '%s'\n",
							hout->target_name[j], hin->target_name[j], fn[i]);
					return -1;
				}

			// If this input file has additional target reference sequences,
			// add them to the headers to be output
			if (hin->n_targets > hout->n_targets) {
				swap_header_targets(hout, hin);
				// FIXME Possibly we should also create @SQ text headers
				// for the newly added reference sequences
			}

			bam_header_destroy(hin);
		}
	}

	if (hheaders) {
		// If the text headers to be swapped in include any @SQ headers,
		// check that they are consistent with the existing binary list
		// of reference information.
		if (hheaders->n_targets > 0) {
			if (hout->n_targets != hheaders->n_targets) {
				fprintf(stderr, "[bam_merge_core] number of @SQ headers in '%s' differs from number of target sequences\n", headers);
				if (!reg) return -1;
			}
			for (j = 0; j < hout->n_targets; ++j)
				if (strcmp(hout->target_name[j], hheaders->target_name[j]) != 0) {
					fprintf(stderr, "[bam_merge_core] @SQ header '%s' in '%s' differs from target sequence\n", hheaders->target_name[j], headers);
					if (!reg) return -1;
				}
		}

		swap_header_text(hout, hheaders);
		bam_header_destroy(hheaders);
	}

	if (reg) {
		int tid, beg, end;
		if (bam_parse_region(hout, reg, &tid, &beg, &end) < 0) {
			fprintf(stderr, "[%s] Malformated region string or undefined reference name\n", __func__);
			return -1;
		}
		for (i = 0; i < n; ++i) {
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			iter[i] = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
	}

	for (i = 0; i < n; ++i) {
		heap1_t *h = heap + i;
		h->i = i;
		h->b = (bam1_t*)calloc(1, sizeof(bam1_t));
		if (bam_iter_read(fp[i], iter[i], h->b) >= 0) {
			h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam1_strand(h->b);
			h->idx = idx++;
		}
		else h->pos = HEAP_EMPTY;
	}
	if (flag & MERGE_UNCOMP) level = 0;
	else if (flag & MERGE_LEVEL1) level = 1;
	strcpy(mode, "w");
	if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
	if ((fpout = strcmp(out, "-")? bam_open(out, "w") : bam_dopen(fileno(stdout), "w")) == 0) {
		fprintf(stderr, "[%s] fail to create the output file.\n", __func__);
		return -1;
	}
	bam_header_write(fpout, hout);
	bam_header_destroy(hout);
#ifndef _PBGZF_USE 
	if (!(flag & MERGE_UNCOMP)) bgzf_mt(fpout, n_threads, 256);
#endif

	ks_heapmake(heap, n, heap);
	while (heap->pos != HEAP_EMPTY) {
		bam1_t *b = heap->b;
		if (flag & MERGE_RG) {
			uint8_t *rg = bam_aux_get(b, "RG");
			if (rg) bam_aux_del(b, rg);
			bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]);
		}
		bam_write1_core(fpout, &b->core, b->data_len, b->data);
		if ((j = bam_iter_read(fp[heap->i], iter[heap->i], b)) >= 0) {
			heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam1_strand(b);
			heap->idx = idx++;
		} else if (j == -1) {
			heap->pos = HEAP_EMPTY;
			free(heap->b->data); free(heap->b);
			heap->b = 0;
		} else fprintf(stderr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]);
		ks_heapadjust(heap, 0, n, heap);
	}

	if (flag & MERGE_RG) {
		for (i = 0; i != n; ++i) free(RG[i]);
		free(RG); free(RG_len);
	}
	for (i = 0; i != n; ++i) {
		bam_iter_destroy(iter[i]);
		bam_close(fp[i]);
	}
	bam_close(fpout);
	free(fp); free(heap); free(iter);
	return 0;
}
Beispiel #26
0
/*!
  @abstract Sort an unsorted BAM file based on the chromosome order
  and the leftmost position of an alignment

  @param  is_by_qname whether to sort by query name
  @param  fn       name of the file to be sorted
  @param  prefix   prefix of the output and the temporary files; upon
	                   sucessess, prefix.bam will be written.
  @param  max_mem  approxiate maximum memory (very inaccurate)

  @discussion It may create multiple temporary subalignment files
  and then merge them by calling bam_merge_core(). This function is
  NOT thread safe.
 */
void bam_sort_core_ext(int is_by_qname, const char *fn, const char *prefix, size_t _max_mem, int is_stdout, int n_threads, int level, int sort_type)
{
	int ret, i, n_files = 0;
	size_t mem, max_k, k, max_mem;
	bam_header_t *header;
	bamFile fp;
	bam1_t *b, **buf;
	char *fnout = 0;

	if (n_threads < 2) n_threads = 1;
	g_is_by_qname = is_by_qname;
	max_k = k = 0; mem = 0;
	max_mem = _max_mem * n_threads;
	buf = 0;
	fp = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "[bam_sort_core] fail to open file %s\n", fn);
		return;
	}
	header = bam_header_read(fp);
	if (is_by_qname) change_SO(header, "queryname");
	else change_SO(header, "coordinate");
	// write sub files
	for (;;) {
		if (k == max_k) {
			size_t old_max = max_k;
			max_k = max_k? max_k<<1 : 0x10000;
			buf = realloc(buf, max_k * sizeof(void*));
			memset(buf + old_max, 0, sizeof(void*) * (max_k - old_max));
		}
		if (buf[k] == 0) buf[k] = (bam1_t*)calloc(1, sizeof(bam1_t));
		b = buf[k];
		if ((ret = bam_read1(fp, b)) < 0) break;
		if (b->data_len < b->m_data>>2) { // shrink
			b->m_data = b->data_len;
			kroundup32(b->m_data);
			b->data = realloc(b->data, b->m_data);
		}
		mem += sizeof(bam1_t) + b->m_data + sizeof(void*) + sizeof(void*); // two sizeof(void*) for the data allocated to pointer arrays
		++k;
		if (mem >= max_mem) {
			n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type);
			mem = k = 0;
		}
	}
	if (ret != -1)
		fprintf(stderr, "[bam_sort_core] truncated file. Continue anyway.\n");
	// output file name
	fnout = calloc(strlen(prefix) + 20, 1);
	if (is_stdout) sprintf(fnout, "-");
	else sprintf(fnout, "%s.bam", prefix);
	// write the final output
	if (n_files == 0) { // a single block
		char mode[8];
		strcpy(mode, "w");
		if (level >= 0) sprintf(mode + 1, "%d", level < 9? level : 9);
                sort_aux_core(k, buf, sort_type);
#ifndef _PBGZF_USE 
		write_buffer(fnout, mode, k, buf, header, n_threads);
#else
		write_buffer(fnout, mode, k, buf, header);
#endif
	} else { // then merge
		char **fns;
		n_files = sort_blocks(n_files, k, buf, prefix, header, n_threads, sort_type);
		fprintf(stderr, "[bam_sort_core] merging from %d files...\n", n_files);
		fns = (char**)calloc(n_files, sizeof(char*));
		for (i = 0; i < n_files; ++i) {
			fns[i] = (char*)calloc(strlen(prefix) + 20, 1);
			sprintf(fns[i], "%s.%.4d.bam", prefix, i);
		}
#ifndef _PBGZF_USE 
		bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, n_threads, level);
#else
		bam_merge_core2(is_by_qname, fnout, 0, n_files, fns, 0, 0, level);
#endif
		for (i = 0; i < n_files; ++i) {
			unlink(fns[i]);
			free(fns[i]);
		}
		free(fns);
	}
	free(fnout);
	// free
	for (k = 0; k < max_k; ++k) {
		if (!buf[k]) continue;
		free(buf[k]->data);
		free(buf[k]);
	}
	free(buf);
	bam_header_destroy(header);
	bam_close(fp);
}
Beispiel #27
0
void bam_header_free(bam_header_t *header) {
    bam_header_destroy(header);
}
samfile_t *samopen(const char *fn, const char *mode, const void *aux)
{
	samfile_t *fp;
	fp = (samfile_t*)calloc(1, sizeof(samfile_t));
	if (strchr(mode, 'r')) { // read
		fp->type |= TYPE_READ;
		if (strchr(mode, 'b')) { // binary
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, "r") : bam_dopen(fileno(stdin), "r");
			if (fp->x.bam == 0) goto open_err_ret;
			fp->header = bam_header_read(fp->x.bam);
		} else { // text
			fp->x.tamr = sam_open(fn);
			if (fp->x.tamr == 0) goto open_err_ret;
			fp->header = sam_header_read(fp->x.tamr);
			if (fp->header->n_targets == 0) { // no @SQ fields
				if (aux) { // check if aux is present
					bam_header_t *textheader = fp->header;
					fp->header = sam_header_read2((const char*)aux);
					if (fp->header == 0) goto open_err_ret;
					append_header_text(fp->header, textheader->text, textheader->l_text);
					bam_header_destroy(textheader);
				}
				if (fp->header->n_targets == 0 && bam_verbose >= 1)
					fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
			} //else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
		}
	} else if (strchr(mode, 'w')) { // write
		fp->header = bam_header_dup((const bam_header_t*)aux);
		if (strchr(mode, 'b')) { // binary
			char bmode[3];
			int i, compress_level = -1;
			for (i = 0; mode[i]; ++i) if (mode[i] >= '0' && mode[i] <= '9') break;
			if (mode[i]) compress_level = mode[i] - '0';
			if (strchr(mode, 'u')) compress_level = 0;
			bmode[0] = 'w'; bmode[1] = compress_level < 0? 0 : compress_level + '0'; bmode[2] = 0;
			fp->type |= TYPE_BAM;
			fp->x.bam = strcmp(fn, "-")? bam_open(fn, bmode) : bam_dopen(fileno(stdout), bmode);
			if (fp->x.bam == 0) goto open_err_ret;
			bam_header_write(fp->x.bam, fp->header);
		} else { // text
			// open file
			fp->x.tamw = strcmp(fn, "-")? fopen(fn, "w") : stdout;
			if (fp->x.tamw == 0) goto open_err_ret;
			if (strchr(mode, 'X')) fp->type |= BAM_OFSTR<<2;
			else if (strchr(mode, 'x')) fp->type |= BAM_OFHEX<<2;
			else fp->type |= BAM_OFDEC<<2;
			// write header
			if (strchr(mode, 'h')) {
				int i;
				bam_header_t *alt;
				// parse the header text 
				alt = bam_header_init();
				alt->l_text = fp->header->l_text; alt->text = fp->header->text;
				sam_header_parse(alt);
				alt->l_text = 0; alt->text = 0;
				// check if there are @SQ lines in the header
				fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
				if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
					if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
						fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
				} else { // then dump ->target_{name,len}
					for (i = 0; i < fp->header->n_targets; ++i)
						fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
				}
				bam_header_destroy(alt);
			}
		}
	}
	return fp;

open_err_ret:
	free(fp);
	return 0;
}
Beispiel #29
0
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
	extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
	extern void bcf_call_del_rghash(void *rghash);
	mplp_aux_t **data;
	int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
	const bam_pileup1_t **plp;
	bam_mplp_t iter;
	bam_header_t *h = 0;
	char *ref;
	void *rghash = 0;

	bcf_callaux_t *bca = 0;
	bcf_callret1_t *bcr = 0;
	bcf_call_t bc;
	bcf_t *bp = 0;
	bcf_hdr_t *bh = 0;

	bam_sample_t *sm = 0;
	kstring_t buf;
	mplp_pileup_t gplp;

	memset(&gplp, 0, sizeof(mplp_pileup_t));
	memset(&buf, 0, sizeof(kstring_t));
	memset(&bc, 0, sizeof(bcf_call_t));
	data = calloc(n, sizeof(void*));
	plp = calloc(n, sizeof(void*));
	n_plp = calloc(n, sizeof(int*));
	sm = bam_smpl_init();

	// read the header and initialize data
	for (i = 0; i < n; ++i) {
		bam_header_t *h_tmp;
		data[i] = calloc(1, sizeof(mplp_aux_t));
		data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r");
		data[i]->conf = conf;
		h_tmp = bam_header_read(data[i]->fp);
		data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
		bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
		rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
		if (conf->reg) {
			int beg, end;
			bam_index_t *idx;
			idx = bam_index_load(fn[i]);
			if (idx == 0) {
				fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) {
				fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1);
				exit(1);
			}
			if (i == 0) tid0 = tid, beg0 = beg, end0 = end;
			data[i]->iter = bam_iter_query(idx, tid, beg, end);
			bam_index_destroy(idx);
		}
		if (i == 0) h = h_tmp;
		else {
			// FIXME: to check consistency
			bam_header_destroy(h_tmp);
		}
	}
	gplp.n = sm->n;
	gplp.n_plp = calloc(sm->n, sizeof(int));
	gplp.m_plp = calloc(sm->n, sizeof(int));
	gplp.plp = calloc(sm->n, sizeof(void*));

	fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
	// write the VCF header
	if (conf->flag & MPLP_GLF) {
		kstring_t s;
		bh = calloc(1, sizeof(bcf_hdr_t));
		s.l = s.m = 0; s.s = 0;
		bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w");
		for (i = 0; i < h->n_targets; ++i) {
			kputs(h->target_name[i], &s);
			kputc('\0', &s);
		}
		bh->l_nm = s.l;
		bh->name = malloc(s.l);
		memcpy(bh->name, s.s, s.l);
		s.l = 0;
		for (i = 0; i < sm->n; ++i) {
			kputs(sm->smpl[i], &s); kputc('\0', &s);
		}
		bh->l_smpl = s.l;
		bh->sname = malloc(s.l);
		memcpy(bh->sname, s.s, s.l);
		bh->txt = malloc(strlen(BAM_VERSION) + 64);
		bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION);
		free(s.s);
		bcf_hdr_sync(bh);
		bcf_hdr_write(bp, bh);
		bca = bcf_call_init(-1., conf->min_baseQ);
		bcr = calloc(sm->n, sizeof(bcf_callret1_t));
		bca->rghash = rghash;
		bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
		bca->min_frac = conf->min_frac;
		bca->min_support = conf->min_support;
	}
	if (tid0 >= 0 && conf->fai) { // region is set
		ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
		ref_tid = tid0;
		for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
	} else ref_tid = -1, ref = 0;
	iter = bam_mplp_init(n, mplp_func, (void**)data);
	max_depth = conf->max_depth;
	if (max_depth * sm->n > 1<<20)
		fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
	if (max_depth * sm->n < 8000) {
		max_depth = 8000 / sm->n;
		fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
	}
	max_indel_depth = conf->max_indel_depth * sm->n;
	bam_mplp_set_maxcnt(iter, max_depth);
	while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) {
		if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
		if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
		if (tid != ref_tid) {
			free(ref); ref = 0;
			if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
			for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
			ref_tid = tid;
		}
		if (conf->flag & MPLP_GLF) {
			int total_depth, _ref0, ref16;
			bcf1_t *b = calloc(1, sizeof(bcf1_t));
			for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
			group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
			_ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
			ref16 = bam_nt16_table[_ref0];
			for (i = 0; i < gplp.n; ++i)
				bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
			bcf_call_combine(gplp.n, bcr, ref16, &bc);
			bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
						 (conf->flag&MPLP_FMT_SP), 0, 0);
			bcf_write(bp, bh, b);
			bcf_destroy(b);
			// call indels
			if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) {
				for (i = 0; i < gplp.n; ++i)
					bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
				if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) {
					b = calloc(1, sizeof(bcf1_t));
					bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0,
								 (conf->flag&MPLP_FMT_SP), bca, ref);
					bcf_write(bp, bh, b);
					bcf_destroy(b);
				}
			}
		} else {
			printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
			for (i = 0; i < n; ++i) {
				int j;
				printf("\t%d\t", n_plp[i]);
				if (n_plp[i] == 0) {
					printf("*\t*"); // FIXME: printf() is very slow...
					if (conf->flag & MPLP_PRINT_POS) printf("\t*");
				} else {
					for (j = 0; j < n_plp[i]; ++j)
						pileup_seq(plp[i] + j, pos, ref_len, ref);
					putchar('\t');
					for (j = 0; j < n_plp[i]; ++j) {
						const bam_pileup1_t *p = plp[i] + j;
						int c = bam1_qual(p->b)[p->qpos] + 33;
						if (c > 126) c = 126;
						putchar(c);
					}
					if (conf->flag & MPLP_PRINT_MAPQ) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							int c = plp[i][j].b->core.qual + 33;
							if (c > 126) c = 126;
							putchar(c);
						}
					}
					if (conf->flag & MPLP_PRINT_POS) {
						putchar('\t');
						for (j = 0; j < n_plp[i]; ++j) {
							if (j > 0) putchar(',');
							printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
						}
					}
				}
			}
			putchar('\n');
		}
	}

	bcf_close(bp);
	bam_smpl_destroy(sm); free(buf.s);
	for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
	free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
	bcf_call_del_rghash(rghash);
	bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr);
	bam_mplp_destroy(iter);
	bam_header_destroy(h);
	for (i = 0; i < n; ++i) {
		bam_close(data[i]->fp);
		if (data[i]->iter) bam_iter_destroy(data[i]->iter);
		free(data[i]);
	}
	free(data); free(plp); free(ref); free(n_plp);
	return 0;
}
Beispiel #30
0
int main_bedcov(int argc, char *argv[])
{
	extern void bam_init_header_hash(bam_header_t*);
	gzFile fp;
	kstring_t str;
	kstream_t *ks;
	bam_index_t **idx;
	bam_header_t *h = 0;
	aux_t **aux;
	int *n_plp, dret, i, n, c, min_mapQ = 0;
	int64_t *cnt;
	const bam_pileup1_t **plp;

	while ((c = getopt(argc, argv, "Q:")) >= 0) {
		switch (c) {
		case 'Q': min_mapQ = atoi(optarg); break;
		}
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
		return 1;
	}
	memset(&str, 0, sizeof(kstring_t));
	n = argc - optind - 1;
	aux = calloc(n, sizeof(aux_t*));
	idx = calloc(n, sizeof(bam_index_t*));
	for (i = 0; i < n; ++i) {
		aux[i] = calloc(1, sizeof(aux_t));
		aux[i]->min_mapQ = min_mapQ;
		aux[i]->fp = bam_open(argv[i+optind+1], "r");
		idx[i] = bam_index_load(argv[i+optind+1]);
		if (aux[i]->fp == 0 || idx[i] == 0) {
			fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
			return 2;
		}
		bgzf_set_cache_size(aux[i]->fp, 20);
		if (i == 0) h = bam_header_read(aux[0]->fp);
	}
	bam_init_header_hash(h);
	cnt = calloc(n, 8);

	fp = gzopen(argv[optind], "rb");
	ks = ks_init(fp);
	n_plp = calloc(n, sizeof(int));
	plp = calloc(n, sizeof(bam_pileup1_t*));
	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
		char *p, *q;
		int tid, beg, end, pos;
		bam_mplp_t mplp;

		for (p = q = str.s; *p && *p != '\t'; ++p);
		if (*p != '\t') goto bed_error;
		*p = 0; tid = bam_get_tid(h, q); *p = '\t';
		if (tid < 0) goto bed_error;
		for (q = p = p + 1; isdigit(*p); ++p);
		if (*p != '\t') goto bed_error;
		*p = 0; beg = atoi(q); *p = '\t';
		for (q = p = p + 1; isdigit(*p); ++p);
		if (*p == '\t' || *p == 0) {
			int c = *p;
			*p = 0; end = atoi(q); *p = c;
		} else goto bed_error;

		for (i = 0; i < n; ++i) {
			if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
			aux[i]->iter = bam_iter_query(idx[i], tid, beg, end);
		}
		mplp = bam_mplp_init(n, read_bam, (void**)aux);
		bam_mplp_set_maxcnt(mplp, 64000);
		memset(cnt, 0, 8 * n);
		while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
			if (pos >= beg && pos < end)
				for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
		for (i = 0; i < n; ++i) {
			kputc('\t', &str);
			kputl(cnt[i], &str);
		}
		puts(str.s);
		bam_mplp_destroy(mplp);
		continue;

bed_error:
		fprintf(stderr, "Errors in BED line '%s'\n", str.s);
	}
	free(n_plp); free(plp);
	ks_destroy(ks);
	gzclose(fp);

	free(cnt);
	for (i = 0; i < n; ++i) {
		if (aux[i]->iter) bam_iter_destroy(aux[i]->iter);
		bam_index_destroy(idx[i]);
		bam_close(aux[i]->fp);
		free(aux[i]);
	}
	bam_header_destroy(h);
	free(aux); free(idx);
	free(str.s);
	return 0;
}