Exemple #1
0
/**
 * Reads header of a VCF file and returns the bcf header object.
 * This wraps around vcf_hdr_read from the original htslib to
 * allow for an alternative header file to be read in.
 *
 * this searches for the alternative header saved as <filename>.hdr
 */
bcf_hdr_t *bcf_alt_hdr_read(htsFile *fp)
{
    bcf_hdr_t *h = NULL;

    //check for existence of alternative header
    kstring_t alt_hdr_fn = {0, 0, 0};
    kputs(fp->fn, &alt_hdr_fn);
    kputs(".hdr", &alt_hdr_fn);
    FILE *file = fopen(alt_hdr_fn.s, "r");
    if (!file)
    {
        h = bcf_hdr_read(fp);
    }
    else
    {
        fprintf(stderr, "[I:%s:%d %s] read alternative header for %s\n", __FILE__, __LINE__, __FUNCTION__, fp->fn);
        fclose(file);
        htsFile *alt_hdr = hts_open(alt_hdr_fn.s, "r");
        h = bcf_hdr_read(alt_hdr);
        hts_close(alt_hdr);
    
        //helps move the pointer to the right place
        bcf_hdr_t *temp_h = bcf_hdr_read(fp);
        bcf_hdr_destroy(temp_h);
    }

    if (alt_hdr_fn.m) free(alt_hdr_fn.s);
    return h;
}
Exemple #2
0
static int query_chroms(char *fname)
{
    const char **seq;
    int i, nseq, ftype = file_type(fname);
    if ( ftype & IS_TXT || !ftype )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi index of %s\n", fname);
        seq = tbx_seqnames(tbx, &nseq);
        for (i=0; i<nseq; i++)
            printf("%s\n", seq[i]);
        free(seq);
        tbx_destroy(tbx);
    }
    else if ( ftype==IS_BCF )
    {
        htsFile *fp = hts_open(fname,"r");
        if ( !fp ) error("Could not read %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        hts_close(fp);
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        seq = bcf_index_seqnames(idx, hdr, &nseq);
        for (i=0; i<nseq; i++)
            printf("%s\n", seq[i]);
        free(seq);
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( ftype==IS_BAM )   // todo: BAM
        error("BAM: todo\n");
    return 0;
}
Exemple #3
0
bcf_sweep_t *bcf_sweep_init(const char *fname)
{
    bcf_sweep_t *sw = (bcf_sweep_t*) calloc(1,sizeof(bcf_sweep_t));
    sw->file = hts_open(fname, "r");
    sw->fp   = hts_get_bgzfp(sw->file);
    bgzf_index_build_init(sw->fp);
    sw->hdr  = bcf_hdr_read(sw->file);
    sw->mrec = 1;
    sw->rec  = (bcf1_t*) calloc(sw->mrec,(sizeof(bcf1_t)));
    sw->block_size = 1024*1024*3;
    sw->direction = SW_FWD;
    return sw;
}
Exemple #4
0
int main_getalt(int argc, char *argv[])
{
	int c;
	char *fn;
	BGZF *fp;
	bcf1_t *b;
	bcf_hdr_t *h;
	kstring_t s = {0,0,0};

	while ((c = getopt(argc, argv, "")) >= 0) {
	}
	if (argc - optind == 0) {
		fprintf(stderr, "Usage: bgt getalt <bgt-base>\n");
		return 1;
	}

	fn = (char*)calloc(strlen(argv[optind]) + 5, 1);
	sprintf(fn, "%s.bcf", argv[optind]);
	fp = bgzf_open(fn, "r");
	free(fn);
	assert(fp);

	h = bcf_hdr_read(fp);
	b = bcf_init1();
	while (bcf_read1(fp, b) >= 0) {
		char *ref, *alt;
		int l_ref, l_alt, i, min_l;
		bcf_get_ref_alt1(b, &l_ref, &ref, &l_alt, &alt);
		min_l = l_ref < l_alt? l_ref : l_alt;
		for (i = 0; i < min_l && ref[i] == alt[i]; ++i);
		s.l = 0;
		kputs(h->id[BCF_DT_CTG][b->rid].key, &s);
		kputc(':', &s); kputw(b->pos + 1 + i, &s);
		kputc(':', &s); kputw(b->rlen - i, &s);
		kputc(':', &s); kputsn(alt + i, l_alt - i, &s);
		puts(s.s);
	}
	bcf_destroy1(b);
	bcf_hdr_destroy(h);

	bgzf_close(fp);
	free(s.s);
	return 0;
}
Exemple #5
0
static int view_vcf(hFILE *hfp, const char *filename)
{
    vcfFile *in = hts_hopen(hfp, filename, "r");
    if (in == NULL) return 0;
    vcfFile *out = dup_stdout("w");
    bcf_hdr_t *hdr = bcf_hdr_read(in);

    if (show_headers) bcf_hdr_write(out, hdr);
    if (mode == view_all) {
        bcf1_t *rec = bcf_init();
        while (bcf_read(in, hdr, rec) >= 0)
            bcf_write(out, hdr, rec);
        bcf_destroy(rec);
    }

    bcf_hdr_destroy(hdr);
    hts_close(out);
    hts_close(in);
    return 1;
}
Exemple #6
0
int main(int argc, char **argv)
{
    if (argc != 3) {
	fprintf(stderr,"anno_setter <in.vcf.gz> <columns_string>\n");
	return 1;
    }
    bcf_hdr_t *h = NULL; //bcf_hdr_init();
    htsFile *fp = hts_open(argv[1], "r");
    if (fp == NULL)
	error("%s : %s", argv[1], strerror(errno));
    
    h = bcf_hdr_read(fp);
    if (h == NULL)
	error("failed to prase header");
    bcf_hdr_t *out = bcf_hdr_dup(h);
    char *string = strdup(argv[2]);
    int ncols = 0;
    anno_col_t *cols = init_columns(string, h, out, &ncols, anno_is_vcf);
    print_anno_cols(cols, ncols);
    hts_close(fp);
    return 0;
}
Exemple #7
0
bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
{
	kstring_t meta, smpl;
	int dret;
	vcf_t *v;
	bcf_hdr_t *h;
	if (!bp->is_vcf) return bcf_hdr_read(bp);
	h = calloc(1, sizeof(bcf_hdr_t));
	v = (vcf_t*)bp->v;
	v->line.l = 0;
	memset(&meta, 0, sizeof(kstring_t));
	memset(&smpl, 0, sizeof(kstring_t));
	while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
		if (v->line.l < 2) continue;
		if (v->line.s[0] != '#') return 0; // no sample line
		if (v->line.s[0] == '#' && v->line.s[1] == '#') {
			kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
		} else if (v->line.s[0] == '#') {
			int k;
			ks_tokaux_t aux;
			char *p;
			for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
				if (k >= 9) {
					kputsn(p, aux.p - p, &smpl);
					kputc('\0', &smpl);
				}
			}
			break;
		}
	}
	kputc('\0', &meta);
	h->name = 0;
	h->sname = smpl.s; h->l_smpl = smpl.l;
	h->txt = meta.s; h->l_txt = meta.l;
	bcf_hdr_sync(h);
	return h;
}
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    htsFile* file_ptr = hts_open(fname, "r");
    if ( ! file_ptr ) {
        files->errnum = open_failed;
        return 0;
    }

    files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    files->has_line[files->nreaders] = 0;
    files->readers  = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = file_ptr;

    files->errnum = 0;

    if ( files->require_index )
    {
        if ( reader->file->format.format==vcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->file->format.format==bcf )
        {
            if ( reader->file->format.compression!=bgzf )
            {
                files->errnum = not_bgzf;
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx )
            {
                files->errnum = idx_load_failed;
                return 0;
            }
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
    }
    else
    {
        if ( reader->file->format.format==bcf || reader->file->format.format==vcf )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            files->errnum = file_type_error;
            return 0;
        }
        files->streaming = 1;
    }
    if ( files->streaming && files->nreaders>1 )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    if ( files->streaming && files->regions )
    {
        files->errnum = api_usage_error;
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header )
    {
        files->errnum = header_error;
        return 0;
    }

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);
    }

    return 1;
}
Exemple #9
0
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
	if (!fp->is_bin) {
		kstring_t txt, *s = &fp->line;
		bcf_hdr_t *h;
		h = bcf_hdr_init();
		txt.l = txt.m = 0; txt.s = 0;
		while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
			if (s->l == 0) continue;
			if (s->s[0] != '#') {
				if (hts_verbose >= 2)
					fprintf(stderr, "[E::%s] no sample line\n", __func__);
				free(txt.s);
				bcf_hdr_destroy(h);
				return 0;
			}
			if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
				int dret;
				gzFile f;
				kstream_t *ks;
				kstring_t tmp;
				tmp.l = tmp.m = 0; tmp.s = 0;
				f = gzopen(fp->fn_aux, "r");
				ks = ks_init(f);
				while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
					int c;
					kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
					ks_getuntil(ks, 0, &tmp, &dret);
					kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
					kputsn(">\n", 2, &txt);
					if (dret != '\n')
						while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
				}
				free(tmp.s);
				ks_destroy(ks);
				gzclose(f);
			}
			kputsn(s->s, s->l, &txt);
			if (s->s[1] != '#') break;
			kputc('\n', &txt);
		}
		h->l_text = txt.l + 1; // including NULL
		h->text = txt.s;
		bcf_hdr_parse(h);
        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
			int i, n, need_sync = 0;
			const char **names = tbx_seqnames(idx, &n);
			for (i=0; i<n; i++)
			{
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue;
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
			}
			free(names);
			tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
		}
		return h;
	} else return bcf_hdr_read((BGZF*)fp->fp);
}
Exemple #10
0
static int query_regions(args_t *args, char *fname, char **regs, int nregs)
{
    int i;
    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) error("Could not read %s\n", fname);
    enum htsExactFormat format = hts_get_format(fp)->format;

    regidx_t *reg_idx = NULL;
    if ( args->targets_fname )
    {
        reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL);
        if ( !reg_idx ) error("Could not read %s\n", args->targets_fname);
    }

    if ( format == bcf )
    {
        htsFile *out = hts_open("-","w");
        if ( !out ) error("Could not open stdout\n", fname);
        hts_idx_t *idx = bcf_index_load(fname);
        if ( !idx ) error("Could not load .csi index of %s\n", fname);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Could not read the header: %s\n", fname);
        if ( args->print_header )
            bcf_hdr_write(out,hdr);
        if ( !args->header_only )
        {
            bcf1_t *rec = bcf_init();
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]);
                while ( bcf_itr_next(fp, itr, rec) >=0 )
                {
                    if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue;
                    bcf_write(out,hdr,rec);
                }
                tbx_itr_destroy(itr);
            }
            bcf_destroy(rec);
        }
        if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n");
        bcf_hdr_destroy(hdr);
        hts_idx_destroy(idx);
    }
    else if ( format==vcf || format==sam || format==unknown_format )
    {
        tbx_t *tbx = tbx_index_load(fname);
        if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname);
        kstring_t str = {0,0,0};
        if ( args->print_header )
        {
            while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 )
            {
                if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break;
                puts(str.s);
            }
        }
        if ( !args->header_only )
        {
            int nseq;
            const char **seq = NULL;
            if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq);
            for (i=0; i<nregs; i++)
            {
                hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]);
                if ( !itr ) continue;
                while (tbx_itr_next(fp, tbx, itr, &str) >= 0)
                {
                    if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue;
                    puts(str.s);
                }
                tbx_itr_destroy(itr);
            }
            free(seq);
        }
        free(str.s);
        tbx_destroy(tbx);
    }
    else if ( format==bam )
        error("Please use \"samtools view\" for querying BAM files.\n");

    if ( reg_idx ) regidx_destroy(reg_idx);
    if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname);

    for (i=0; i<nregs; i++) free(regs[i]);
    free(regs);
    return 0;
}
Exemple #11
0
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                //  Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                bcf_write1(args->out_fh, args->out_hdr, line);
                if ( args->remove_dups ) break;
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        int prev_chr_id = -1, prev_pos;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;

                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
Exemple #12
0
int ctx_vcfcov(int argc, char **argv)
{
  struct MemArgs memargs = MEM_ARGS_INIT;
  const char *out_path = NULL, *out_type = NULL;

  uint32_t max_allele_len = 0, max_gt_vars = 0;
  char *ref_path = NULL;
  bool low_mem = false;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;
  size_t i;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'm': cmd_mem_args_set_memory(&memargs, optarg); break;
      case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break;
      case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break;
      case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break;
      case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break;
      case 'M': cmd_check(!low_mem, cmd); low_mem = true; break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        // cmd_print_usage(NULL);
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: abort();
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)");
  if(optind+2 > argc) cmd_print_usage("Require VCF and graph files");

  if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN;
  if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS;

  status("[vcfcov] max allele length: %u; max number of variants: %u",
         max_allele_len, max_gt_vars);

  // open ref
  // index fasta with: samtools faidx ref.fa
  faidx_t *fai = fai_load(ref_path);
  if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path);

  // Open input VCF file
  const char *vcf_path = argv[optind++];
  htsFile *vcffh = hts_open(vcf_path, "r");
  if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path);
  bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh);
  if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path);

  // Test we can close and reopen files
  if(low_mem) {
    if((vcffh = hts_open(vcf_path, "r")) == NULL)
      die("Cannot re-open VCF file: %s", vcf_path);
    if((vcfhdr = bcf_hdr_read(vcffh)) == NULL)
      die("Cannot re-read VCF header: %s", vcf_path);
  }

  //
  // Open graph files
  //
  const size_t num_gfiles = argc - optind;
  char **graph_paths = argv + optind;
  ctx_assert(num_gfiles > 0);

  GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader));
  size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0;

  ncols = graph_files_open(graph_paths, gfiles, num_gfiles,
                           &ctx_max_kmers, &ctx_sum_kmers);

  // Check graph + paths are compatible
  graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1);

  //
  // Decide on memory
  //
  size_t bits_per_kmer, kmers_in_hash, graph_mem;

  bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols;
  kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use,
                                        memargs.mem_to_use_set,
                                        memargs.num_kmers,
                                        memargs.num_kmers_set,
                                        bits_per_kmer,
                                        low_mem ? -1 : (int64_t)ctx_max_kmers,
                                        ctx_sum_kmers,
                                        true, &graph_mem);

  cmd_check_mem_limit(memargs.mem_to_use, graph_mem);

  //
  // Open output file
  //
  // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *outfh = hts_open(out_path, modes_htslib[mode]);
  status("[vcfcov] Output format: %s", hsmodes_htslib[mode]);


  // Allocate memory
  dBGraph db_graph;
  db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash,
                 DBG_ALLOC_COVGS);

  //
  // Set up tag names
  //

  // *R => ref, *A => alt
  sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage
  sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size);

  // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK>
  // - K29_kcov is empirical kmer coverage
  // - K29_nkmers is the number of kmers in the sample
  // - mean_read_length is the mean read length in bases
  char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20];
  sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage
  sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size);
  sprintf(sample_rlk_tag, "mean_read_length");

  //
  // Load kmers if we are using --low-mem
  //

  VcfCovStats st;
  memset(&st, 0, sizeof(st));
  VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag,
                       .kcov_alt_tag = kcov_alt_tag,
                       .max_allele_len = max_allele_len,
                       .max_gt_vars = max_gt_vars,
                       .load_kmers_only = false};

  if(low_mem)
  {
    status("[vcfcov] Loading kmers from VCF+ref");

    prefs.load_kmers_only = true;
    vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai,
                NULL, &prefs, &st, &db_graph);

    // Close files
    hts_close(vcffh);
    bcf_hdr_destroy(vcfhdr);

    // Re-open files
    if((vcffh = hts_open(vcf_path, "r")) == NULL)
      die("Cannot re-open VCF file: %s", vcf_path);
    if((vcfhdr = bcf_hdr_read(vcffh)) == NULL)
      die("Cannot re-read VCF header: %s", vcf_path);

    prefs.load_kmers_only = false;
  }

  //
  // Load graphs
  //
  GraphLoadingStats gstats;
  memset(&gstats, 0, sizeof(gstats));

  GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph);
  gprefs.must_exist_in_graph = low_mem;

  for(i = 0; i < num_gfiles; i++) {
    graph_load(&gfiles[i], gprefs, &gstats);
    graph_file_close(&gfiles[i]);
  }
  ctx_free(gfiles);

  hash_table_print_stats(&db_graph.ht);

  //
  // Set up VCF header / graph matchup
  //
  size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t));

  // Add samples to vcf header
  bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr);
  bcf_hrec_t *hrec;
  int sid;
  char hdrstr[200];

  for(i = 0; i < db_graph.num_of_cols; i++) {
    char *sname = db_graph.ginfo[i].sample_name.b;
    if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) {
      bcf_hdr_add_sample(outhdr, sname);
      sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname);
    }
    samplehdrids[i] = sid;

    // Add SAMPLE field
    hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE");

    if(hrec == NULL) {
      sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname,
              sample_kcov_tag,
              gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0,
              sample_nk_tag, gstats.nkmers[i],
              sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length);
      bcf_hdr_append(outhdr, hdrstr);
    }
    else {
      // mean kcovg
      sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]);
      vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr);
      // num kmers
      sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]);
      vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr);
      // mean read length in kmers
      sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length);
      vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr);
    }

    status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]);
  }

  // Add genotype format fields
  // One field per alternative allele

  sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer,"
          "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n",
          kcov_ref_tag, db_graph.kmer_size);
  bcf_hdr_append(outhdr, hdrstr);
  sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer,"
          "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n",
          kcov_alt_tag, db_graph.kmer_size);
  bcf_hdr_append(outhdr, hdrstr);

  bcf_hdr_set_version(outhdr, "VCFv4.2");

  // Add command string to header
  vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd());

  if(bcf_hdr_write(outfh, outhdr) != 0)
    die("Cannot write header to: %s", futil_outpath_str(out_path));

  status("[vcfcov] Reading %s and adding coverage", vcf_path);

  // Reset stats and get coverage
  memset(&st, 0, sizeof(st));

  vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai,
              samplehdrids, &prefs, &st, &db_graph);

  // Print statistics
  char ns0[50], ns1[50];
  status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0));
  status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0));
  status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0));
  status("[vcfcov] ALTs used: %s / %s (%.2f%%)",
         ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len,
         ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)",
         max_gt_vars, db_graph.kmer_size,
         ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0);
  status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)",
         ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1),
         st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0);

  status("[vcfcov] Saved to: %s\n", out_path);

  ctx_free(samplehdrids);
  graph_loading_stats_destroy(&gstats);

  bcf_hdr_destroy(vcfhdr);
  bcf_hdr_destroy(outhdr);
  hts_close(vcffh);
  hts_close(outfh);
  fai_destroy(fai);
  db_graph_dealloc(&db_graph);

  return EXIT_SUCCESS;
}
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) {
  cerr << "Input: " << input << "\tOutput: "<<output<<endl;

  kstream_t *ks;
  kstring_t str = {0,0,0};    
  gzFile fp = gzopen(input, "r");
  VarBuffer vbuf(1000);
  int prev_rid = -1;
  if(fp==NULL) {
    fprintf(stderr,"problem opening %s\n",input);
    exit(1);
  }

  char *out_fname = (char *)malloc(strlen(output)+5);
  strcpy(out_fname,output);
  strcat(out_fname,".tmp");
  if(fileexists(out_fname)) {
    fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname);
    exit(1);
  }
  printf("depth: %s\n",out_fname);
  gzFile depth_fp = gzopen(out_fname, "wb1");
  strcpy(out_fname,output);
  strcat(out_fname,".bcf");
  if(fileexists(out_fname)) {
    fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname);
    exit(1);
  }
  printf("variants: %s\n",out_fname);
  htsFile *variant_fp=hts_open(out_fname,"wb1");
  if(variant_fp==NULL) {
    fprintf(stderr,"problem opening %s\n",input);
    exit(1);    
  }

  ks = ks_init(fp);
  htsFile *hfp=hts_open(input, "r");
  bcf_hdr_t *hdr_in =  bcf_hdr_read(hfp);
  hts_close(hfp);
  //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R)
  bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD");
  assert(  bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0);

  //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer)
  bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ");
  assert(  bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0);


  //  bcf_hdr_t  *hdr_out=hdr_in;
  bcf_hdr_t *hdr_out =  bcf_hdr_dup(hdr_in);
  remove_hdr_lines(hdr_out,BCF_HL_INFO);
  remove_hdr_lines(hdr_out,BCF_HL_FLT);
  bcf_hdr_sync(hdr_out);

  //here we add FORMAT/PF. which is the pass filter flag for alts.
  assert(  bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0);

  args_t *norm_args = init_vcfnorm(hdr_out,ref);
  norm_args->check_ref |= CHECK_REF_WARN;
  bcf1_t *bcf_rec = bcf_init();
  bcf_hdr_write(variant_fp, hdr_out);
  kstring_t work1 = {0,0,0};            
  int buf[5];
  ks_tokaux_t aux;
  int ndec=0;
  int ref_len,alt_len;
  while(    ks_getuntil(ks, '\n', &str, 0) >=0) {
    //    fprintf(stderr,"%s\n",str.s);
    if(str.s[0]!='#')  {
      char *ptr = kstrtok(str.s,"\t",&aux);//chrom
      ptr = kstrtok(NULL,NULL,&aux);//pos
      work1.l=0;
      kputsn(str.s,ptr-str.s-1, &work1);   
      buf[0] =  bcf_hdr_name2id(hdr_in, work1.s);
      assert(      buf[0]>=0);
      buf[1]=atoi(ptr)-1;
      ptr = kstrtok(NULL,NULL,&aux);//ID
      ptr = kstrtok(NULL,NULL,&aux);//REF

      ref_len=0;
      while(ptr[ref_len]!='\t') ref_len++;

      ptr = kstrtok(NULL,NULL,&aux);//ALT

      bool is_variant=false;
      alt_len=0;
      while(ptr[alt_len]!='\t') alt_len++;
      if(ptr[0]!='.') 
	is_variant=true;

      char * QUAL_ptr = kstrtok(NULL, NULL, &aux);
      assert (QUAL_ptr != NULL);
      
      for(int i=0;i<2;i++)  ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO

      //find END if it is there
      char *end_ptr=strstr(ptr,"END=") ;
      if(end_ptr!=NULL) 
	buf[2]=atoi(end_ptr+4)-1;
      else
	buf[2]=buf[1]+alt_len-1;

      ptr  = kstrtok(NULL,NULL,&aux);//FORMAT
      //find index of DP (if present)
      //if not present, dont output anything (indels ignored)

      char *DP_ptr = find_format(ptr,"DP");
      int GQX = 0;
      int QUAL = 0;

      // AH: change code to use the minimum of GQ and QUAL fields if
      // GQX is not defined. See here:
      // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file
      // "GQXGenotype quality. GQX is the minimum of the GQ value
      // and the QUAL column. In general, these are similar values;
      // taking the minimum makes GQX the more conservative measure of
      // genotype quality."
      if(DP_ptr!=NULL) {
	buf[3]=atoi(DP_ptr);
	char *GQX_ptr = find_format(ptr,"GQX");
	if (GQX_ptr == NULL) 
	  {
	    GQX_ptr = find_format(ptr,"GQ");
	    GQX = atoi(GQX_ptr);
	    if (QUAL_ptr[0] != '.') 
	      {
		QUAL = atoi(QUAL_ptr);
		if (QUAL < GQX)
		  GQX = QUAL;
	      }
	  }
	else
	  {
	    GQX = atoi(GQX_ptr);
	  }
	
	//trying to reduce entropy on GQ to get better compression performance.
	//1. rounds down to nearest 10. 
	//2. sets gq to min(gq,100). 
	buf[4]=GQX/10;
	buf[4]*=10;
	if(buf[4]>100) buf[4]=100;

	//	printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]);
	if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int)))
	  die("ERROR: problem writing "+(string)out_fname+".tmp");
      }
      if(is_variant) {//wass this a variant? if so write it out to the bcf
	norm_args->ntotal++;
	vcf_parse(&str,hdr_in,bcf_rec);
	//	cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl;
	if(prev_rid!=bcf_rec->rid) 
	  vbuf.flush(variant_fp,hdr_out);
	else
	  vbuf.flush(bcf_rec->pos,variant_fp,hdr_out);
	prev_rid=bcf_rec->rid;
	int32_t pass = bcf_has_filter(hdr_in, bcf_rec, ".");
	bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1);
	bcf_update_filter(hdr_out,bcf_rec,NULL,0);
	if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3
	  norm_args->nsplit++;
	  split_multiallelic_to_biallelics(norm_args,bcf_rec );
	  for(int i=0;i<norm_args->ntmp_lines;i++){
	    remove_info(norm_args->tmp_lines[i]);
	    if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH)
	      ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf);
	    else
	      if(exit_on_mismatch)
		die("vcf did not match the reference");
	      else
		norm_args->nskipped++;
	  }
	}
	else {
	  remove_info(bcf_rec);
	  if( realign(norm_args,bcf_rec) !=  ERR_REF_MISMATCH)
	    ndec+=decompose(bcf_rec,hdr_out,vbuf);
	  else
	    if(exit_on_mismatch)
	      die("vcf did not match the reference");
	    else
	      norm_args->nskipped++;
	}
	vbuf.flush(bcf_rec->pos,variant_fp,hdr_out);
      }
    }
  }
  vbuf.flush(variant_fp,hdr_out);
  bcf_hdr_destroy(hdr_in);
  bcf_hdr_destroy(hdr_out);
  bcf_destroy1(bcf_rec);
  ks_destroy(ks);
  gzclose(fp);
  gzclose(depth_fp);  
  free(str.s);
  free(work1.s);
  hts_close(variant_fp);
  destroy_data(norm_args);
  fprintf(stderr,"Variant lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped);
  fprintf(stderr,"Decomposed %d MNPs\n", ndec);


  fprintf(stderr,"Indexing %s\n",out_fname);
  bcf_index_build(out_fname, BCF_LIDX_SHIFT);
  free(out_fname);
  return 0;
}
Exemple #14
0
int main(int argc, char* argv[]) {
    namespace po = boost::program_options;

    std::string file;
    std::string output;

    try
    {
        // Declare the supported options.
        po::options_description desc("Allowed options");
        desc.add_options()
            ("help,h", "produce help message")
            ("version", "Show version")            
            ("input-file", po::value< std::string >(), "The input files")
            ("output-file", po::value<std::string>(), "The output file name.")
        ;

        po::positional_options_description popts;
        popts.add("input-file", 1);
        popts.add("output-file", 1);

        po::options_description cmdline_options;
        cmdline_options
            .add(desc)
        ;

        po::variables_map vm;
        
        po::store(po::command_line_parser(argc, argv).
                  options(cmdline_options).positional(popts).run(), vm);
        po::notify(vm); 

        if (vm.count("version")) 
        {
            std::cout << "vcfhdr2json version " << HAPLOTYPES_VERSION << "\n";
            return 0;
        }

        if (vm.count("help")) 
        {
            std::cout << desc << "\n";
            return 1;
        }

        if (vm.count("input-file"))
        {
            file = vm["input-file"].as< std::string > ();
        }

        if (vm.count("output-file"))
        {
            output = vm["output-file"].as< std::string >();
        }

        if(file.size() == 0)
        {
            std::cerr << "Please specify an input file.\n";
            return 1;
        }

        if (output == "")
        {
            std::cerr << "Please specify an output file.\n";
            return 1; 
        }
    } 
    catch (po::error & e)
    {
        std::cerr << e.what() << "\n";
        return 1;
    }

    try
    {
        Json::StyledWriter writer;
        htsFile * fp = bcf_open(file.c_str(), "r");
        bcf_hdr_t * hdr = bcf_hdr_read(fp);

        Json::Value root;
        Json::Value a;
        for (int i = 0; i < bcf_hdr_nsamples(hdr); ++i)
        {
            a.append(hdr->samples[i]);
        }
        root["samples"] = a;

        Json::Value fields;
        for (int i = 0; i < hdr->nhrec; i++)
        {
            Json::Value field;
            field["key"] = hdr->hrec[i]->key;
            if (!hdr->hrec[i]->value)
            {
                Json::Value values;

                for (int j = 0; j < hdr->hrec[i]->nkeys; j++)
                {
                    values[hdr->hrec[i]->keys[j]] = hdr->hrec[i]->vals[j];
                }
                field["values"] = values;
            }
            else
            {
                field["value"] = hdr->hrec[i]->value;
            }
            fields.append(field);
        }
        root["fields"] = fields;

        tbx_t * tbx_idx = tbx_index_load(file.c_str());
        if ( !tbx_idx )
        {
            hts_idx_t * csi_idx = bcf_index_load(file.c_str());
            if(!csi_idx)
            {
                root["tabix"] = Json::Value::null;
            }
            else
            {
                root["tabix"] = Json::Value();
                root["tabix"]["chromosomes"] = Json::Value();

                int count = 0;
                const char ** tbx_names = bcf_index_seqnames(csi_idx, hdr, &count);

                for (int i = 0; i < count; ++i)
                {
                    root["tabix"]["chromosomes"].append(tbx_names[i]);
                }
                free(tbx_names);
                hts_idx_destroy(csi_idx);
            }
        }
        else
        {
            root["tabix"] = Json::Value();
            root["tabix"]["chromosomes"] = Json::Value();

            int count = 0;
            const char ** tbx_names = tbx_seqnames(tbx_idx, &count);

            for (int i = 0; i < count; ++i)
            {
                root["tabix"]["chromosomes"].append(tbx_names[i]);
            }

            free(tbx_names);
            tbx_destroy(tbx_idx);
        }


        std::ofstream out(output.c_str());
        out << writer.write(root);

        bcf_close(fp);
        bcf_hdr_destroy(hdr);
    } 
    catch(std::runtime_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }
    catch(std::logic_error & e)
    {
        std::cerr << e.what() << std::endl;
        return 1;
    }

    return 0;
}
static void reheader_bcf(args_t *args, int is_compressed)
{
    htsFile *fp = hts_open(args->fname, "r"); if ( !fp ) error("Failed to open: %s\n", args->fname);
    bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname);
    kstring_t htxt = {0,0,0};
    int hlen;
    htxt.s = bcf_hdr_fmt_text(hdr, 1, &hlen);
    htxt.l = hlen;

    int i, nsamples = 0;
    char **samples = NULL;
    if ( args->samples_fname )
        samples = hts_readlines(args->samples_fname, &nsamples);
    if ( args->header_fname )
    {
        free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0;
        read_header_file(args->header_fname, &htxt);
    }
    if ( samples )
    {
        set_samples(samples, nsamples, &htxt);
        for (i=0; i<nsamples; i++) free(samples[i]);
        free(samples);
    }

    bcf_hdr_t *hdr_out = bcf_hdr_init("r");
    bcf_hdr_parse(hdr_out, htxt.s);
    if ( args->header_fname ) hdr_out = strip_header(hdr, hdr_out);

    // write the header and the body
    htsFile *fp_out = hts_open("-",is_compressed ? "wb" : "wbu");
    bcf_hdr_write(fp_out, hdr_out);

    bcf1_t *rec = bcf_init();
    while ( bcf_read(fp, hdr, rec)==0 )
    {
        // sanity checking, this slows things down. Make it optional?
        bcf_unpack(rec, BCF_UN_ALL);
        if ( rec->rid >= hdr_out->n[BCF_DT_CTG] || strcmp(bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid),bcf_hdr_int2id(hdr_out,BCF_DT_CTG,rec->rid)) )
            error("The CHROM is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid));

        for (i=0; i<rec->d.n_flt; i++)
        {
            int id = rec->d.flt[i];
            if ( id >= hdr_out->n[BCF_DT_ID] ) break;
            if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FLT,id) ) break;
            if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) )
                error("FIXME: Broken FILTER ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key);
        }
        if ( i!=rec->d.n_flt )
            error("The FILTER is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i]));

        for (i=0; i<rec->n_info; i++)
        {
            int id = rec->d.info[i].key;
            if ( id >= hdr_out->n[BCF_DT_ID] ) break;
            if ( !hdr_out->id[BCF_DT_ID][id].key ) break;
            if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_INFO,id) ) break;
            if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) )
                error("FIXME: Broken INFO ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key);
        }
        if ( i!=rec->n_info )
            error("The INFO tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.info[i].key));

        for (i=0; i<rec->n_fmt; i++)
        {
            int id = rec->d.fmt[i].id;
            if ( id >= hdr_out->n[BCF_DT_ID] ) break;
            if ( !hdr_out->id[BCF_DT_ID][id].key ) break;
            if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FMT,id) ) break;
            if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) )
                error("FIXME: Broken FORMAT ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key);
        }
        if ( i!=rec->n_fmt )
            error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id));

        bcf_write(fp_out,hdr_out,rec);
    }
    bcf_destroy(rec);

    free(htxt.s);
    hts_close(fp_out);
    hts_close(fp);
    bcf_hdr_destroy(hdr_out);
    bcf_hdr_destroy(hdr);
}
Exemple #16
0
int main(int argc, char **argv)
{
    if ( argc == 1 )
	error("Usage : bed_annos -c config.json -O z -o output.vcf.gz input.vcf.gz");
    int i;
    for ( i = 1; i < argc; ) {
	const char *a = argv[i++];
	const char **var = 0;
	if ( strcmp(a, "-c") == 0 )
	    var = &json_fname;
	else if ( strcmp(a, "-O") == 0 )
	    var = &output_fname_type;
	else if ( strcmp(a, "-o") == 0 )
	    var = &output_fname;

	if ( var != 0 ) {
	    if ( i == argc )
		error("Missing an argument after %s", a);
	    *var = argv[i++];
	    continue;
	}

	if ( input_fname == 0 ) {
	    input_fname = a;
	    continue;
	}

	error("Unknown argument : %s.", a);
    }

    struct vcfanno_config *con = vcfanno_config_init();
    if ( vcfanno_load_config(con, json_fname) != 0 )
	error("Failed to load configure file. %s : %s", json_fname, strerror(errno));
    vcfanno_config_debug(con);
    if ( con->beds.n_beds == 0)
	error("No bed database specified.");
    if ( input_fname == 0 && (!isatty(fileno(stdin))) )
	input_fname = "-";
    if ( input_fname == 0 )
	error("No input file.");

    int out_type = FT_VCF;
    if ( output_fname_type != 0 ) {
	switch (output_fname_type[0]) {
	    case 'b':
		out_type = FT_BCF_GZ; break;
	    case 'u':
		out_type = FT_BCF; break;
	    case 'z':
		out_type = FT_VCF_GZ; break;
	    case 'v':
		out_type = FT_VCF; break;
	    default :
		error("The output type \"%d\" not recognised\n", out_type);
	};
    }

    htsFile *fp = hts_open(input_fname, "r");
    if ( fp == NULL )
	error("Failed to open %s : %s.", input_fname, strerror(errno));
    htsFormat type = *hts_get_format(fp);
    if ( type.format != vcf && type.format != bcf )
	error("Unsupported input format. %s", input_fname);
    
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    if ( hdr == NULL )
	error("Failed to parse header.");	
    bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr);    
    htsFile *fout = output_fname == 0 ? hts_open("-", hts_bcf_wmode(out_type)) : hts_open(output_fname, hts_bcf_wmode(out_type));
    struct beds_options opts = { .beds_is_inited = 0,};
    beds_options_init(&opts);
    opts.hdr_out = hdr_out;

    for ( i = 0; i < con->beds.n_beds; ++i ) {
	beds_database_add(&opts, con->beds.files[i].fname, con->beds.files[i].columns);
    }

    bcf_hdr_write(fout, hdr_out);
    bcf1_t *line = bcf_init();
    while ( bcf_read(fp, hdr, line) == 0 ) {
	anno_beds_core(&opts, line);
	bcf_write(fout, hdr_out, line);
    }
    bcf_destroy(line);
    bcf_hdr_destroy(hdr);
    bcf_hdr_destroy(hdr_out);
    beds_options_destroy(&opts);
    hts_close(fp);
    hts_close(fout);
    return 0;

}
Exemple #17
0
int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
{
    files->has_line = (int*) realloc(files->has_line, sizeof(int)*(files->nreaders+1));
    files->has_line[files->nreaders] = 0;
    files->readers  = (bcf_sr_t*) realloc(files->readers, sizeof(bcf_sr_t)*(files->nreaders+1));
    bcf_sr_t *reader = &files->readers[files->nreaders++];
    memset(reader,0,sizeof(bcf_sr_t));

    reader->file = hts_open(fname, "r");
    if ( !reader->file ) return 0;

    reader->type = reader->file->is_bin? FT_BCF : FT_VCF;
    if (reader->file->is_compressed) reader->type |= FT_GZ;

    if ( files->require_index )
    {
        if ( reader->type==FT_VCF_GZ ) 
        {
            reader->tbx_idx = tbx_index_load(fname);
            if ( !reader->tbx_idx )
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;
            }

            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type==FT_BCF_GZ ) 
        {
            reader->header = bcf_hdr_read(reader->file);

            reader->bcf_idx = bcf_index_load(fname);
            if ( !reader->bcf_idx ) 
            {
                fprintf(stderr,"[add_reader] Could not load the index of %s\n", fname);
                return 0;   // not indexed..?
            }
        }
        else
        {
            fprintf(stderr,"Index required, expected .vcf.gz or .bcf file: %s\n", fname);
            return 0;
        }
    }
    else 
    {
        if ( reader->type & FT_BCF )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else if ( reader->type & FT_VCF )
        {
            reader->header = bcf_hdr_read(reader->file);
        }
        else
        {
            fprintf(stderr,"File type not recognised: %s\n", fname);
            return 0;
        }
        files->streaming = 1;
    }
    if ( files->streaming && files->nreaders>1 )
    {
        fprintf(stderr,"[%s:%d %s] Error: %d readers, yet require_index not set\n", __FILE__,__LINE__,__FUNCTION__,files->nreaders);
        return 0;
    }
    if ( files->streaming && files->regions )
    {
        fprintf(stderr,"[%s:%d %s] Error: cannot tabix-jump in streaming mode\n", __FILE__,__LINE__,__FUNCTION__);
        return 0;
    }
    if ( !reader->header ) return 0;

    reader->fname = fname;
    if ( files->apply_filters )
        reader->filter_ids = init_filters(reader->header, files->apply_filters, &reader->nfilter_ids);

    // Update list of chromosomes
    if ( !files->explicit_regs && !files->streaming )
    {
        int n,i;
        const char **names = reader->tbx_idx ? tbx_seqnames(reader->tbx_idx, &n) : bcf_hdr_seqnames(reader->header, &n);
        for (i=0; i<n; i++)
        {
            if ( !files->regions )
                files->regions = _regions_init_string(names[i]);
            else
                _regions_add(files->regions, names[i], -1, -1);
        }
        free(names);
    }

    return 1;
}
Exemple #18
0
//{{{ void get_bcf_query_result(uint32_t *mask,
void get_bcf_query_result(uint32_t *mask,
                        uint32_t mask_len,
                        struct gqt_query *q,
                        char **id_query_list,
                        uint32_t *id_lens,
                        uint32_t num_qs,
                        uint32_t num_fields,
                        char *vid_file_name,
                        char *bcf_file_name,
                        int bcf_output)
{

    /* The VID file contains the line numbers of the variants after they have
     * been sorted.  To reach back into the BCF file to print the metadata
     * associated with the variants marked in the mask, we need to create a
     * sorted list of line numbers we want.  So first we intersect the VID file
     * and the mask, then sort it.
     */
    /*
    FILE *vid_f = fopen(vid_file_name, "rb");
    if (!vid_f)
        err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name);

    uint32_t *vids = (uint32_t *) malloc(num_fields*sizeof(uint32_t));
    if (!vids )
        err(EX_OSERR, "malloc error");

    size_t fr = fread(vids, sizeof(uint32_t), num_fields, vid_f);
    check_file_read(vid_file_name, vid_f, num_fields, fr);

    fclose(vid_f);
    */
    struct vid_file *vid_f = open_vid_file(vid_file_name);
    load_vid_data(vid_f);

    uint32_t i, j, masked_vid_count = 0;

    for (i = 0; i < mask_len; ++i)
        masked_vid_count += popcount(mask[i]);

    uint32_t *masked_vids = (uint32_t *)
            malloc(masked_vid_count*sizeof(uint32_t));
    if (!masked_vids )
        err(EX_OSERR, "malloc error");
    uint32_t masked_vid_i = 0;

    for (i = 0; i < mask_len; ++i) {
        uint32_t bytes = mask[i];
	if (bytes == 0)
            continue; /* skip a bunch of ops if you can */
        for (j = 0; j < 32; j++) {
            if (bytes & (1 << (31 - j))) {
                masked_vids[masked_vid_i] = vid_f->vids[i*32 + j];
                masked_vid_i+=1;
            }
        }
        if (masked_vid_i == masked_vid_count)
            break;
    }

    destroy_vid_file(vid_f);

    qsort(masked_vids, masked_vid_count, sizeof(uint32_t), compare_uint32_t);

    htsFile *fp    = hts_open(bcf_file_name,"rb");
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    bcf1_t *line    = bcf_init1();
    //bcf_hdr_set_samples(hdr, print_name_csv, 0);

    htsFile *out;
    if (!bcf_output)
        out = hts_open("-", "w");
    else
        out = hts_open("-", "wb");

    int r = bcf_hdr_write(out, hdr);

    uint32_t bcf_line_i = 0;
    masked_vid_i = 0;
    while ( bcf_read(fp, hdr, line) != -1) {
        if (masked_vids[masked_vid_i] == bcf_line_i) {
            r = bcf_unpack(line, BCF_UN_ALL);
            r = bcf_write1(out, hdr, line);
            masked_vid_i+=1;
        }
        if (masked_vid_i == masked_vid_count)
            break;
        bcf_line_i += 1;
    }

    hts_close(out);
    hts_close(fp);
}
Exemple #19
0
int main(int argc, char **argv)
{
    int i, n;
    static struct option const long_opts[] =
    {
	{"out", required_argument, NULL, 1},
	{"report", required_argument, NULL, 2},
	{"dotasref", no_argument, NULL, 3},
	{"help", no_argument, NULL, 0},
	{"version", no_argument, NULL, 4},
	{"export_uncov", no_argument, NULL, 5}
    };
    bool help = FALSE;
    bool report_version = FALSE;
    while ((n = getopt_long(argc, argv, "1:2:304", long_opts, NULL)) >= 0)
    {
	switch (n)
	{
	case 1 : outfile = strdup(optarg); break;
	case 2 : report = strdup(optarg); break;
	case 3 : dotasref = TRUE; break;
	case 0 : help = TRUE; break;
	case 4 : report_version = TRUE; break;
	case 5 : export_uncover = TRUE; break;
	default : return 1;
	}
	if ( help ) return usage();
	if ( report_version ) return show_version();
    }
    n = argc - optind;
    if ( n > 1 ) errabort("only accept one input vcf");
    if ( export_uncover == TRUE && outfile == FALSE) {
	warnings("export uncove region only used with option --out");
	export_uncover = FALSE;
    }
    char * input;
    if ( n == 0 ) input = strdup("-");
    else input = strdup(argv[optind]);
    htsFile * fp = read_vcf_file(input);
    enum htsExactFormat fmt = hts_get_format(fp)->format;
    if ( fmt != vcf && fmt != bcf ) errabort("This is not a VCF/BCF file : %s", input);
    bcf_hdr_t * hdr = bcf_hdr_read(fp);
    int n_samples = bcf_hdr_nsamples(hdr);
    if ( n_samples != 2 ) errabort("the input VCF/BCF file must contain only two samples! %d", n_samples);
    LOG("Using sample %s as ref ...", hdr->samples[0]);
    LOG("Using sample %s as test ...", hdr->samples[1]);
    uint32_t matrix[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} };
    bcf1_t * v = bcf_init1();
    kstring_t str = { 0, 0, 0 };
    uint32_t line = 0;
    htsFile *out = NULL;
    if ( outfile && !check_filename(outfile) ) out = hts_open(outfile, mode);
    if ( out != NULL ) bcf_hdr_write(out, hdr);
    while ( bcf_read1(fp, hdr, v) >= 0 )
    {
	bcf_unpack(v, BCF_UN_STR|BCF_UN_FMT);
	int k;
	str.l = 0;
	int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT");
	if ( !bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, tag_id) ) warnings("There is no 'GT' in the header!");
	for ( i = 0; i < v->n_fmt; ++i )
	    if ( v->d.fmt[i].id == tag_id ) break;
	if ( i == v->n_fmt ) {
	    vcf_format1(hdr, v, &str);
	    LOG("There is no tag GT in this line : %s", str.s);
	    continue;
	}
	corr_t xy[2] = { {-1, -2, -2}, {-1, -2, -2} };
	bcf_fmt_t * fmt = &v->d.fmt[i];

	for ( i = 0; i < 2; ++i )
	{
	    int corr = i;
	    if ( fmt == NULL ) {
		if ( dotasref == TRUE ) xy[corr].alt = ALT_IS_REF;
		else xy[corr].alt = ALT_IS_UNC;
		continue;
	    }
	    int last = -2;
	    uint8_t *d = (uint8_t*)((char*)fmt->p + fmt->size*i);
	    for ( k = 0; k < fmt->n && d[k] != (uint8_t)bcf_int8_vector_end; ++k )
	    {
		int curr = d[k]>>1;
		if ( last != curr ) {
		    if ( curr ) {
			if ( last == -2 ) xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF;
			else xy[corr].alt =  ALT_IS_HET;
		    } else {
			xy[corr].alt =  dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC;
		    }
		} else {
		    if ( curr ) {
			xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF;
		    } else {
			xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC;
		    }
		}
		if (last == -2 ) {
		    xy[corr].min = xy[corr].max = curr;
		} else {
		    if ( curr < xy[corr].min ) xy[corr].min = curr;
		    else if ( curr > xy[corr].max ) xy[corr].max = curr;
		}
		last = curr;
	    }
	}
	matrix[xy[0].alt][xy[1].alt]++;
	if ( xy[0].alt != xy[1].alt && out != NULL) {
	    if ( xy[0].alt == ALT_IS_UNC || xy[1].alt == ALT_IS_UNC ) {
		if ( export_uncover == TRUE ) {
		    str.l = 0;
		    vcf_format1(hdr, v, &str);
		    vcf_write(out, hdr, v);
		}
	    } else {
		str.l = 0;
		vcf_format1(hdr, v, &str);
		vcf_write(out, hdr, v);
	    }
	}
	if ( xy[0].alt == ALT_IS_HET && xy[1].alt == ALT_IS_HET && (xy[0].min != xy[1].min || xy[0].max != xy[1].max ) ) {
	    bias++;
	    matrix[ALT_IS_HET][ALT_IS_HET]--;
	    if ( out != NULL ) {
		str.l = 0;
		vcf_format1(hdr, v, &str);
		vcf_write(out, hdr, v);
	    }
	}
	line++;
    }
    if ( out ) hts_close(out);
    if ( str.m ) free(str.s);
    write_report(matrix, hdr);
    bcf_hdr_destroy(hdr);
    free(input);
    bcf_destroy1(v);
    if ( outfile ) free(outfile);
    if ( report ) free(report);
    if ( hts_close(fp) ) warnings("hts_close returned non-zero status: %s", input);
    return 0;
}
Exemple #20
0
int vcf_index_stats(char *fname, int stats)
{
    char *fn_out = NULL;
    FILE *out;
    out = fn_out ? fopen(fn_out, "w") : stdout;

    const char **seq;
    int i, nseq;
    tbx_t *tbx = NULL;
    hts_idx_t *idx = NULL;

    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; }
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; }

    if ( hts_get_format(fp)->format==vcf )
    {
        tbx = tbx_index_load(fname);
        if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; }
    }
    else if ( hts_get_format(fp)->format==bcf )
    {
        idx = bcf_index_load(fname);
        if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; }
    }
    else
    {
        fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
        return 1;
    }

    seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
    uint64_t sum = 0;
    for (i=0; i<nseq; i++)
    {
        uint64_t records, v;
        hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
        sum+=records;
        if (stats&2 || !records) continue;
        bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
        int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
        fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
    }
    if (!sum)
    {
        // No counts found.
        // Is this because index version has no stored count data, or no records?
        bcf1_t *rec = bcf_init1();
        if (bcf_read1(fp, hdr, rec) >= 0)
        {
            fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
            return 1;
        }
        bcf_destroy1(rec);
    }
    if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
    free(seq);
    fclose(out);
    hts_close(fp);
    bcf_hdr_destroy(hdr);
    if (tbx)
        tbx_destroy(tbx);
    if (idx)
        hts_idx_destroy(idx);
    return 0;
}
Exemple #21
0
int main(int argc, char **argv)
{
    if (argc < 3) {
        fprintf(stderr,"%s <bcf file> <num vars>\n", argv[0]);
        return 1;
    }

    char *fname = argv[1];
    uint32_t num_vars = atoi(argv[2]);

    htsFile *fp    = hts_open(fname,"rb");
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    bcf1_t *line    = bcf_init1();
    int32_t *gt_p = NULL;

    uint32_t num_inds = bcf_hdr_nsamples(hdr);
    
    int32_t i, j, k, ntmp = 0, int_i = 0, two_bit_i = 0, sum, t_sum = 0;

    uint32_t num_ind_ints = 1 + ((num_inds - 1) / 16);

    pri_queue q = priq_new(0);
    priority p;

    uint32_t *packed_ints = (uint32_t *) calloc(num_ind_ints,
                                                sizeof(uint32_t));

    FILE *gt_of = fopen("gt.tmp.packed","wb");
    FILE *md_of = fopen("md.tmp.packed","w");

    uint32_t *md_index = (uint32_t *) malloc(num_vars * sizeof(uint32_t));
    uint32_t md_i = 0;

    unsigned long t_bcf_read = 0, 
                  t_bcf_dup = 0,
                  t_bcf_unpack = 0,
                  t_bcf_get_genotypes = 0,
                  t_bcf_hdr_nsamples = 0,
                  t_q = 0,
                  t_write = 0,
                  t_get_md = 0,
                  t_md_write = 0,
                  t_pack = 0;

    for (i = 0; i < num_vars; ++i) {
        sum = 0;
        int_i = 0;
        two_bit_i = 0;

        int r = bcf_read(fp, hdr, line);
        
        // Copy
        bcf1_t *t_line = bcf_dup(line);

        // Unpack
        bcf_unpack(t_line, BCF_UN_ALL);

        // Get metadata
        size_t len = strlen(bcf_hdr_id2name(hdr, t_line->rid)) +
                     10 + // max length of pos
                     strlen(t_line->d.id) +
                     strlen(t_line->d.allele[0]) +
                     strlen(t_line->d.allele[1]) +
                     4; //tabs
        char *md = (char *) malloc(len * sizeof(char));

        sprintf(md, "%s\t%d\t%s\t%s\t%s",
                     bcf_hdr_id2name(hdr, t_line->rid),
                     t_line->pos + 1,
                     t_line->d.id,
                     t_line->d.allele[0],
                     t_line->d.allele[1]); 

        // Write metadata
        md_i += strlen(md);
        md_index[i] = md_i;
        fprintf(md_of, "%s", md);

        // Get gentotypes
        uint32_t num_gts_per_sample = bcf_get_genotypes(hdr,
                                                        t_line,
                                                        &gt_p,
                                                        &ntmp);
        num_gts_per_sample /= num_inds;
        int32_t *gt_i = gt_p;
        
        // Pack genotypes
        for (j = 0; j < num_inds; ++j) {
            uint32_t gt = 0;
            for (k = 0; k < num_gts_per_sample; ++k) {
                gt += bcf_gt_allele(gt_i[k]);
            }

            packed_ints[int_i] += gt << (30 - 2*two_bit_i);
            two_bit_i += 1;
            if (two_bit_i == 16) {
                two_bit_i = 0;
                int_i += 1;
            }

            sum += gt;
            gt_i += num_gts_per_sample;
        }

        // Get a priority for the variant based on the sum and number of 
        // leading zeros
        p.sum = sum;
        uint32_t prefix_len = 0;
        j = 0;
        while ((j < num_ind_ints) && (packed_ints[j] == 0)){
            prefix_len += 32;
            j += 1;
        }
        if (j < num_ind_ints)
            prefix_len += nlz1(packed_ints[j]);
        
        // Push it into the q
        p.len = prefix_len;
        int *j = (int *) malloc (sizeof(int));
        j[0] = i;
        priq_push(q, j, p);

        // Write to file
        fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,gt_of);

        memset(packed_ints, 0, num_ind_ints*sizeof(uint32_t));

        t_sum += sum;

        bcf_destroy(t_line);
        free(md);
    }
    fclose(gt_of);
    fclose(md_of);


    md_of = fopen("md.tmp.packed","r");
    FILE *md_out = fopen("md.bim","w");
    gt_of = fopen("gt.tmp.packed","rb");
    FILE *s_gt_of = fopen("s.gt.tmp.packed","wb");

    // Get variants in order and rewrite a variant-major sorted matrix
    while ( priq_top(q, &p) != NULL ) {
        int *d = priq_pop(q, &p);

        uint32_t start = 0;
        if (*d != 0)
            start = md_index[*d - 1];

        uint32_t len = md_index[*d] - start;

        fseek(md_of, start*sizeof(char), SEEK_SET);
        char buf[len+1];
        fread(buf, sizeof(char), len, md_of);
        buf[len] = '\0';

        fseek(gt_of, (*d)*num_ind_ints*sizeof(uint32_t), SEEK_SET);
        fread(packed_ints, sizeof(uint32_t), num_ind_ints, gt_of);
        fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,s_gt_of);

        fprintf(md_out, "%s\n", buf);
    }

    fclose(md_out);
    fclose(md_of);
    fclose(gt_of);
    fclose(s_gt_of);


    /*
     * In a packed-int variant-major matrix there will be a num_vars
     * number of rows, and a num_inds number of values packed into
     * num_inds_ints number of intergers.  For examples, 16 rows of 16 values
     * will be 16 ints, where each int encodes 16 values.
     *
     */
    
    uint32_t num_var_ints = 1 + ((num_vars - 1) / 16);

    uint32_t *I_data = (uint32_t *)calloc(num_var_ints*16,sizeof(uint32_t));
    uint32_t **I = (uint32_t **)malloc(16*sizeof(uint32_t*));
    for (i = 0; i < 16; ++i)
        I[i] = I_data + i*num_var_ints;
    uint32_t I_i = 0, I_int_i = 0;

    uint32_t v;

    s_gt_of = fopen("s.gt.tmp.packed","rb");
    FILE *rs_gt_of = fopen("r.s.gt.tmp.packed","wb");

    // Write these to values to that this is a well-formed uncompressed 
    // packed int binary file (ubin) file
    fwrite(&num_vars, sizeof(uint32_t), 1, rs_gt_of);
    fwrite(&num_inds, sizeof(uint32_t), 1, rs_gt_of);
     
    /* 
     * we need to loop over the columns in the v-major file.
     * There are num_vars rows, and num_ind_ints 16-ind packed columns
     *
     * In this loop :
     *  i: cols in var-major form, rows in ind-major form
     *  j: rows in var-major form, cols in ind-major form
     */
    uint32_t num_inds_to_write = num_inds;
    for (i = 0; i < num_ind_ints; ++i) { // loop over each int col
        for (j = 0; j < num_vars; ++j) { // loop over head row in that col
            // skip to the value at the row/col
            fseek(s_gt_of, 
                  j*num_ind_ints*sizeof(uint32_t) + //row
                  i*sizeof(uint32_t), //col
                  SEEK_SET);

            fread(&v, sizeof(uint32_t), 1, s_gt_of);

            // one int corresponds to a col of 16 two-bit values
            // two_bit_i will move across the cols
            for (two_bit_i = 0; two_bit_i < 16; ++two_bit_i) {
                I[two_bit_i][I_i] += ((v >> (30 - 2*two_bit_i)) & 3) << 
                                     (30 - 2*I_int_i);
            }
            I_int_i += 1;

            if (I_int_i == 16) {
                I_i += 1;
                I_int_i = 0;
            }
        }

        // When we are at the end of the file, and the number of lines 
        // is not a factor of 16, only write out the lines that contain values
        if (num_inds_to_write >= 16) {
            fwrite(I_data,
                   sizeof(uint32_t),
                   num_var_ints*16,
                   rs_gt_of);
            num_inds_to_write -= 16;
        } else {
            fwrite(I_data,
                   sizeof(uint32_t),
                   num_var_ints*num_inds_to_write,
                   rs_gt_of);
        }
        memset(I_data, 0, num_var_ints*16*sizeof(uint32_t));
        I_int_i = 0;
        I_i = 0;
    }

    fclose(s_gt_of);
    fclose(rs_gt_of);

    free(md_index);
    free(packed_ints);
}
Exemple #22
0
static void init_data(args_t *args)
{
    bcf1_t *line = NULL;

    // With phased concat, the chunks overlap and come in the right order.  To
    // avoid opening all files at once, store start positions to recognise need
    // for the next one. This way we can keep only two open chunks at once.
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        line = bcf_init();
    }

    kstring_t str = {0,0,0};
    int i, prev_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
        args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
        if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
            error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        int j;
        for (j=0; j<bcf_hdr_nsamples(hdr); j++)
            if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
                error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        if ( args->phased_concat )
        {
            int ret = bcf_read(fp, hdr, line);
            if ( ret!=0 ) args->start_pos[i] = -2;  // empty file
            else
            {
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
                args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
                prev_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        hts_close(fp);
    }
    free(str.s);
    if ( line ) bcf_destroy(line);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    bcf_hdr_write(args->out_fh, args->out_hdr);

    if ( args->allow_overlaps )
    {
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        if ( args->remove_dups )
        {
            if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
            else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
            else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
        }
        for (i=0; i<args->nfnames; i++)
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
    }
    else if ( args->phased_concat )
    {
        // Remove empty files from the list
        int nok = 0;
        while (1)
        {
            while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
            if ( nok==args->nfnames ) break;

            i = nok;
            while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
            if ( i==args->nfnames ) break;

            int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
            char *str = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = str;
        }
        for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
        args->nfnames = nok;

        for (i=1; i<args->nfnames; i++)
            if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
                error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);

        args->prev_chr = -1;
        args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmism  = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->phase_set  = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        args->ifname = 0;
    }
}
Exemple #23
0
int convert(int argc, char **argv)
{
    if (argc < 2) return convert_help();

    int c;
    char *in=NULL, *out=NULL, *bim=NULL, *vid=NULL, *tmp_dir=NULL, *ped=NULL;
    uint32_t num_fields, num_records, col = 2;
    int i_is_set = 0, 
        o_is_set = 0, 
        f_is_set = 0, 
        b_is_set = 0, 
        v_is_set = 0, 
        t_is_set = 0, 
        p_is_set = 0, 
        r_is_set = 0; 

    while((c = getopt (argc, argv, "hi:o:f:r:b:v:t:p:c:")) != -1) {
        switch (c) {
            case 'c':
                col = atoi(optarg);
                break;
            case 'p':
                p_is_set = 1;
                ped = optarg;
                break;
            case 't':
                t_is_set = 1;
                tmp_dir = optarg;
                break;
            case 'v':
                v_is_set = 1;
                vid = optarg;
                break;
            case 'b':
                b_is_set = 1;
                bim = optarg;
                break;
            case 'i':
                i_is_set = 1;
                in = optarg;
                break;
            case 'o':
                o_is_set = 1;
                out = optarg;
                break;
            case 'f':
                f_is_set = 1;
                num_fields = atoi(optarg);
                break;
            case 'r':
                r_is_set = 1;
                num_records = atoi(optarg);
                break;
            case 'h':
                convert_help();
                return 1;
            case '?':
                if ( (optopt == 'i') || 
                     (optopt == 'f') ||
                     (optopt == 'r') ||
                     (optopt == 't') ||
                     (optopt == 's') ||
                     (optopt == 'p') ||
                     (optopt == 'c') ||
                     (optopt == 'o') )
                    fprintf (stderr, "Option -%c requires an argument.\n",
                            optopt);
                else if (isprint (optopt))
                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
                else
                fprintf (stderr, "Unknown option character `\\x%x'.\n", optopt);
            default:
                convert_help();
                return 1;
        }
    }

    char *type = argv[0];

    if (i_is_set == 0) {
        printf("Input file is not set\n");
        return convert_help();
    } 

    if (strcmp(type, "bcf") == 0) {
        if ( (f_is_set == 0) || (r_is_set == 0) ) {

            fprintf(stderr,"Attempting to autodetect num of records "
                    "and fields from %s\n", in);
            //Try and auto detect the sizes, need the index
            tbx_t *tbx = NULL;
            hts_idx_t *idx = NULL;
            htsFile *fp    = hts_open(in,"rb");
            if ( !fp ) {
                fprintf(stderr,"Could not read %s\n", in);
                return 1;
            }

            bcf_hdr_t *hdr = bcf_hdr_read(fp);
            if ( !hdr ) {
                fprintf(stderr,"Could not read the header: %s\n", in);
                return 1;
            }

            if (hts_get_format(fp)->format==vcf) {
                tbx = tbx_index_load(in);
                if ( !tbx ) { 
                    fprintf(stderr,"Could not load TBI index: %s\n", in);
                    return 1;
                }
            } else if ( hts_get_format(fp)->format==bcf ) {
                idx = bcf_index_load(in);
                if ( !idx ) {
                    fprintf(stderr,"Could not load CSI index: %s\n", in);
                    return 1;
                }
            } else {
                fprintf(stderr,
                        "Could not detect the file type as VCF or BCF: %s\n",
                        in);
                return 1;
            }

            num_fields = hdr->n[BCF_DT_SAMPLE];

            num_records = 0;
            const char **seq;
            int nseq;
            seq = tbx ? tbx_seqnames(tbx, &nseq) : 
                    bcf_index_seqnames(idx, hdr, &nseq);
            int i;
            uint32_t sum = 0;
            for (i = 0; i < nseq; ++i) {
                uint64_t records, v;
                hts_idx_get_stat(tbx ? tbx->idx: idx, i, &records, &v);
                num_records += records;
            }

            fprintf(stderr, "Number of records:%u\tNumber of fields:%u\n",
                    num_records, num_fields);
            free(seq);
            hts_close(fp);
            bcf_hdr_destroy(hdr);
            if (idx)
                hts_idx_destroy(idx);
            if (tbx)
                tbx_destroy(tbx);
        }


        if (o_is_set == 0) {
            out  = (char*)malloc(strlen(in) + 5); // 5 for ext and \0
            strcpy(out,in);
            strcat(out, ".gqt");
        }
        if (b_is_set == 0) {
            bim  = (char*)malloc(strlen(in) + 5); // 5 for ext and \0
            strcpy(bim,in);
            strcat(bim, ".bim");
        }
        if (v_is_set == 0) {
            vid  = (char*)malloc(strlen(in) + 5); // 5 for ext and \0
            strcpy(vid,in);
            strcat(vid, ".vid");
        }
        if (t_is_set == 0) {
            tmp_dir  = (char*)malloc(3*sizeof(char)); // "./\0"
            strcpy(tmp_dir,"./");
        }

        int r = bcf_wahbm(in, out, bim, vid, tmp_dir, num_fields, num_records);

        return r;
    } 

    if (strcmp(type, "ped") == 0)  {
        if (o_is_set == 0) {
            if (p_is_set == 1) {
                out  = (char*)malloc(strlen(ped) + 4); // 4 for ext and \0
                strcpy(out,ped);
                strcat(out, ".db");
            } else {
                out  = (char*)malloc(strlen(in) + 4); // 4 for ext and \0
                strcpy(out,in);
                strcat(out, ".db");
            }
      }

      fprintf(stderr, "Creating sample database %s\n", out);
      return ped_ped(in, ped, col, out);
    }
    return convert_help();
}