Exemple #1
0
int main_vcfview(int argc, char *argv[])
{
	int c, clevel = -1, in_type = FT_BCF, out_type = FT_VCF;
	char *fname_out = NULL, moder[8], modew[8];

	while ((c = getopt(argc, argv, "l:bvo:n:z?hu")) >= 0) {
		switch (c) {
            case 'o': 
                switch (optarg[0]) {
                    case 'b': out_type = FT_BCF_GZ; break;
                    case 'u': out_type = FT_BCF; break;
                    case 'z': out_type = FT_VCF_GZ; break;
                    case 'v': out_type = FT_VCF; break;
                    default: error("The output type \"%s\" not recognised\n", optarg);
                }
                break;
            case 'l': clevel = atoi(optarg); out_type |= FT_GZ; break;
            case 'v': in_type  = FT_VCF; break;
            case 'b': out_type = FT_BCF_GZ; break;
            case 'u': out_type = FT_BCF; break;
            case 'z': out_type = FT_VCF_GZ; break;
            case 'n': fname_out = optarg; break;
            case '?':
            case 'h': usage(); return 1; break;
        }
    }
	if (argc!=optind+1) { usage(); return 1; }

    // Init reader
	strcpy(moder, "r");
	if ( (!strcmp("-",argv[optind]) && (in_type & FT_BCF)) || (hts_file_type(argv[optind]) & FT_BCF)) strcat(moder, "b");
	htsFile *fp_in = hts_open(argv[optind], moder, NULL);
    if ( !fp_in ) error("Fail to open: %s\n", argv[optind]);
	bcf_hdr_t *hdr = vcf_hdr_read(fp_in);
    if ( !hdr ) error("Fail to read VCF/BCF header: %s\n", argv[optind]); 
	bcf1_t *rec = bcf_init1();

    // Init writer
    strcpy(modew, "w");
    if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
    if (out_type & FT_GZ) strcat(modew,"z");
    if (out_type & FT_BCF) strcat(modew, "b");
    if (out_type == FT_BCF) strcat(modew, "u"); // uncompressed BCF output
    htsFile *fp_out = hts_open(fname_out ? fname_out : "-", modew, NULL);

    vcf_hdr_write(fp_out, hdr);
    while ( vcf_read1(fp_in, hdr, rec) >= 0) vcf_write1(fp_out, hdr, rec);

	bcf_destroy1(rec);
	bcf_hdr_destroy(hdr);
	hts_close(fp_in);
    hts_close(fp_out);

	return 0;
}
static void init_missed_line(args_t *args)
{
    int i;
    for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
    {
        args->aux.gts[i*2]   = bcf_gt_missing;
        args->aux.gts[i*2+1] = bcf_int32_vector_end;
    }
    args->missed_line = bcf_init1();
    bcf_update_genotypes(args->aux.hdr, args->missed_line, args->aux.gts, 2*bcf_hdr_nsamples(args->aux.hdr));
    bcf_float_set_missing(args->missed_line->qual);
}
Variant VariantBuilder::build() const {
  const auto new_variant_body = utils::make_shared_variant(bcf_init1());

  // Always build from scratch for now, until support for building from a starting variant is added
  build_from_scratch(new_variant_body);

  if ( m_enable_validation ) {
    post_build_validation(new_variant_body);
  }

  return Variant{m_header.m_header, new_variant_body};
}
Exemple #4
0
/**
 * Gets record from pool, creates a new record if necessary
 */
bcf1_t* BCFOrderedReader::get_bcf1_from_pool()
{
    if(!pool.empty())
    {
        bcf1_t* v = pool.front();
        pool.pop_front();
        return v;
    }
    else
    {
        return bcf_init1();
    }
};
Exemple #5
0
static void unread_vcf_line(args_t *args, bcf1_t **rec_ptr)
{
    bcf1_t *rec = *rec_ptr;
    if ( args->vcf_rbuf.n >= args->vcf_rbuf.m )
        error("FIXME: too many overlapping records near %s:%d\n", bcf_seqname(args->hdr,rec),rec->pos+1);

    // Insert the new record in the buffer. The line would be overwritten in
    // the next bcf_sr_next_line call, therefore we need to swap it with an
    // unused one
    int i = rbuf_append(&args->vcf_rbuf);
    if ( !args->vcf_buf[i] ) args->vcf_buf[i] = bcf_init1();
    bcf1_t *tmp = rec; *rec_ptr = args->vcf_buf[i]; args->vcf_buf[i] = tmp;
}
Exemple #6
0
static void phased_push(args_t *args, bcf1_t *arec, bcf1_t *brec)
{
    if ( arec && arec->errcode )
        error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[0].header,arec),arec->pos+1, args->files->readers[0].fname);
    if ( brec && brec->errcode )
        error("Parse error at %s:%d, cannot proceed: %s\n", bcf_seqname(args->files->readers[1].header,brec),brec->pos+1, args->files->readers[1].fname);

    int i, nsmpl = bcf_hdr_nsamples(args->out_hdr);
    int chr_id = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,arec));
    if ( args->prev_chr<0 || args->prev_chr!=chr_id )
    {
        if ( args->prev_chr>=0 ) phased_flush(args);

        for (i=0; i<nsmpl; i++)
            args->phase_set[i] = arec->pos+1;
        args->phase_set_changed = 1;

        if ( args->seen_seq[chr_id] ) error("The chromosome block %s is not contiguous\n", bcf_seqname(args->files->readers[0].header,arec));
        args->seen_seq[chr_id] = 1;
        args->prev_chr = chr_id;
        args->prev_pos_check = -1;
    }

    if ( !brec )
    {
        bcf_translate(args->out_hdr, args->files->readers[0].header, arec);
        if ( args->nswap )
            phase_update(args, args->out_hdr, arec);
        if ( !args->compact_PS || args->phase_set_changed )
        {
            bcf_update_format_int32(args->out_hdr,arec,"PS",args->phase_set,nsmpl);
            args->phase_set_changed = 0;
        }
        bcf_write(args->out_fh, args->out_hdr, arec);

        if ( arec->pos < args->prev_pos_check )
            error("FIXME, disorder: %s:%d in %s vs %d written  [3]\n", bcf_seqname(args->files->readers[0].header,arec), arec->pos+1,args->files->readers[0].fname, args->prev_pos_check+1);
        args->prev_pos_check = arec->pos;
        return;
    }

    int m = args->mbuf;
    args->nbuf += 2;
    hts_expand(bcf1_t*,args->nbuf,args->mbuf,args->buf);
    for (i=m; i<args->mbuf; i++)
        args->buf[i] = bcf_init1();

    SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->buf[args->nbuf-2]);
    SWAP(bcf1_t*, args->files->readers[1].buffer[0], args->buf[args->nbuf-1]);
}
Exemple #7
0
bcf1_t *vcfbuf_push(vcfbuf_t *buf, bcf1_t *rec, int swap)
{
    if ( !swap ) error("todo: swap=%d\n", swap);

    rbuf_expand0(&buf->rbuf, vcfrec_t, buf->rbuf.n+1, buf->vcf);

    int i = rbuf_append(&buf->rbuf);
    if ( !buf->vcf[i].rec ) buf->vcf[i].rec = bcf_init1();
    
    bcf1_t *ret = buf->vcf[i].rec;
    buf->vcf[i].rec = rec;
    buf->vcf[i].af_set = 0;

    return ret;
}
/**
 * Gets record from pool, creates a new record if necessary
 */
bcf1_t* BCFSyncedStreamReader::get_bcf1_from_pool()
{
    if(!pool.empty())
    {
        bcf1_t* v = pool.front();
        pool.pop_front();
        bcf_clear(v);
        return v;
    }
    else
    {
        bcf1_t* v = bcf_init1();
        bcf_clear(v);
        return v;
    }
}
Exemple #9
0
int main_getalt(int argc, char *argv[])
{
	int c;
	char *fn;
	BGZF *fp;
	bcf1_t *b;
	bcf_hdr_t *h;
	kstring_t s = {0,0,0};

	while ((c = getopt(argc, argv, "")) >= 0) {
	}
	if (argc - optind == 0) {
		fprintf(stderr, "Usage: bgt getalt <bgt-base>\n");
		return 1;
	}

	fn = (char*)calloc(strlen(argv[optind]) + 5, 1);
	sprintf(fn, "%s.bcf", argv[optind]);
	fp = bgzf_open(fn, "r");
	free(fn);
	assert(fp);

	h = bcf_hdr_read(fp);
	b = bcf_init1();
	while (bcf_read1(fp, b) >= 0) {
		char *ref, *alt;
		int l_ref, l_alt, i, min_l;
		bcf_get_ref_alt1(b, &l_ref, &ref, &l_alt, &alt);
		min_l = l_ref < l_alt? l_ref : l_alt;
		for (i = 0; i < min_l && ref[i] == alt[i]; ++i);
		s.l = 0;
		kputs(h->id[BCF_DT_CTG][b->rid].key, &s);
		kputc(':', &s); kputw(b->pos + 1 + i, &s);
		kputc(':', &s); kputw(b->rlen - i, &s);
		kputc(':', &s); kputsn(alt + i, l_alt - i, &s);
		puts(s.s);
	}
	bcf_destroy1(b);
	bcf_hdr_destroy(h);

	bgzf_close(fp);
	free(s.s);
	return 0;
}
Exemple #10
0
static bcf1_t *rec_set_info(args_t *args, bcf1_t *rec)
{
    bcf1_t *out = bcf_init1();
    out->rid  = rec->rid;
    out->pos  = rec->pos;
    out->rlen = rec->rlen;
    out->qual = rec->qual;
    out->n_allele = rec->n_allele;
    out->n_sample = 1;
    if ( args->keep_info )
    {
        out->n_info = rec->n_info;
        out->shared.m = out->shared.l = rec->shared.l;
        out->shared.s = (char*) malloc(out->shared.l);
        memcpy(out->shared.s,rec->shared.s,out->shared.l);
        return out;
    }

    // build the BCF record
    kstring_t tmp = {0,0,0};
    char *ptr = rec->shared.s;
    kputsn_(ptr, rec->unpack_size[0], &tmp); ptr += rec->unpack_size[0]; // ID
    kputsn_(ptr, rec->unpack_size[1], &tmp); ptr += rec->unpack_size[1]; // REF+ALT
    kputsn_(ptr, rec->unpack_size[2], &tmp);                             // FILTER
    if ( args->ninfo_tags )
    {
        int i;
        for (i=0; i<rec->n_info; i++)
        {
            bcf_info_t *info = &rec->d.info[i];
            int id = info->key;
            if ( !args->info_tags[id] ) continue;
            kputsn_(info->vptr - info->vptr_off, info->vptr_len + info->vptr_off, &tmp);
            out->n_info++;
        }
    }
    out->shared.m = tmp.m;
    out->shared.s = tmp.s;
    out->shared.l = tmp.l;
    out->unpacked = 0;
    return out;
}
Exemple #11
0
int main(int argc, char **argv)
{
    if (argc < 3) {
        fprintf(stderr,"%s <bcf file> <num vars>\n", argv[0]);
        return 1;
    }

    char *fname = argv[1];
    uint32_t num_vars = atoi(argv[2]);

    htsFile *fp    = hts_open(fname,"rb");
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    bcf1_t *line    = bcf_init1();
    int32_t *gt_p = NULL;

    uint32_t num_inds = bcf_hdr_nsamples(hdr);
    
    int32_t i, j, k, ntmp = 0, int_i = 0, two_bit_i = 0, sum, t_sum = 0;

    uint32_t num_ind_ints = 1 + ((num_inds - 1) / 16);

    pri_queue q = priq_new(0);
    priority p;

    uint32_t *packed_ints = (uint32_t *) calloc(num_ind_ints,
                                                sizeof(uint32_t));

    FILE *gt_of = fopen("gt.tmp.packed","wb");
    FILE *md_of = fopen("md.tmp.packed","w");

    uint32_t *md_index = (uint32_t *) malloc(num_vars * sizeof(uint32_t));
    uint32_t md_i = 0;

    unsigned long t_bcf_read = 0, 
                  t_bcf_dup = 0,
                  t_bcf_unpack = 0,
                  t_bcf_get_genotypes = 0,
                  t_bcf_hdr_nsamples = 0,
                  t_q = 0,
                  t_write = 0,
                  t_get_md = 0,
                  t_md_write = 0,
                  t_pack = 0;

    for (i = 0; i < num_vars; ++i) {
        sum = 0;
        int_i = 0;
        two_bit_i = 0;

        int r = bcf_read(fp, hdr, line);
        
        // Copy
        bcf1_t *t_line = bcf_dup(line);

        // Unpack
        bcf_unpack(t_line, BCF_UN_ALL);

        // Get metadata
        size_t len = strlen(bcf_hdr_id2name(hdr, t_line->rid)) +
                     10 + // max length of pos
                     strlen(t_line->d.id) +
                     strlen(t_line->d.allele[0]) +
                     strlen(t_line->d.allele[1]) +
                     4; //tabs
        char *md = (char *) malloc(len * sizeof(char));

        sprintf(md, "%s\t%d\t%s\t%s\t%s",
                     bcf_hdr_id2name(hdr, t_line->rid),
                     t_line->pos + 1,
                     t_line->d.id,
                     t_line->d.allele[0],
                     t_line->d.allele[1]); 

        // Write metadata
        md_i += strlen(md);
        md_index[i] = md_i;
        fprintf(md_of, "%s", md);

        // Get gentotypes
        uint32_t num_gts_per_sample = bcf_get_genotypes(hdr,
                                                        t_line,
                                                        &gt_p,
                                                        &ntmp);
        num_gts_per_sample /= num_inds;
        int32_t *gt_i = gt_p;
        
        // Pack genotypes
        for (j = 0; j < num_inds; ++j) {
            uint32_t gt = 0;
            for (k = 0; k < num_gts_per_sample; ++k) {
                gt += bcf_gt_allele(gt_i[k]);
            }

            packed_ints[int_i] += gt << (30 - 2*two_bit_i);
            two_bit_i += 1;
            if (two_bit_i == 16) {
                two_bit_i = 0;
                int_i += 1;
            }

            sum += gt;
            gt_i += num_gts_per_sample;
        }

        // Get a priority for the variant based on the sum and number of 
        // leading zeros
        p.sum = sum;
        uint32_t prefix_len = 0;
        j = 0;
        while ((j < num_ind_ints) && (packed_ints[j] == 0)){
            prefix_len += 32;
            j += 1;
        }
        if (j < num_ind_ints)
            prefix_len += nlz1(packed_ints[j]);
        
        // Push it into the q
        p.len = prefix_len;
        int *j = (int *) malloc (sizeof(int));
        j[0] = i;
        priq_push(q, j, p);

        // Write to file
        fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,gt_of);

        memset(packed_ints, 0, num_ind_ints*sizeof(uint32_t));

        t_sum += sum;

        bcf_destroy(t_line);
        free(md);
    }
    fclose(gt_of);
    fclose(md_of);


    md_of = fopen("md.tmp.packed","r");
    FILE *md_out = fopen("md.bim","w");
    gt_of = fopen("gt.tmp.packed","rb");
    FILE *s_gt_of = fopen("s.gt.tmp.packed","wb");

    // Get variants in order and rewrite a variant-major sorted matrix
    while ( priq_top(q, &p) != NULL ) {
        int *d = priq_pop(q, &p);

        uint32_t start = 0;
        if (*d != 0)
            start = md_index[*d - 1];

        uint32_t len = md_index[*d] - start;

        fseek(md_of, start*sizeof(char), SEEK_SET);
        char buf[len+1];
        fread(buf, sizeof(char), len, md_of);
        buf[len] = '\0';

        fseek(gt_of, (*d)*num_ind_ints*sizeof(uint32_t), SEEK_SET);
        fread(packed_ints, sizeof(uint32_t), num_ind_ints, gt_of);
        fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,s_gt_of);

        fprintf(md_out, "%s\n", buf);
    }

    fclose(md_out);
    fclose(md_of);
    fclose(gt_of);
    fclose(s_gt_of);


    /*
     * In a packed-int variant-major matrix there will be a num_vars
     * number of rows, and a num_inds number of values packed into
     * num_inds_ints number of intergers.  For examples, 16 rows of 16 values
     * will be 16 ints, where each int encodes 16 values.
     *
     */
    
    uint32_t num_var_ints = 1 + ((num_vars - 1) / 16);

    uint32_t *I_data = (uint32_t *)calloc(num_var_ints*16,sizeof(uint32_t));
    uint32_t **I = (uint32_t **)malloc(16*sizeof(uint32_t*));
    for (i = 0; i < 16; ++i)
        I[i] = I_data + i*num_var_ints;
    uint32_t I_i = 0, I_int_i = 0;

    uint32_t v;

    s_gt_of = fopen("s.gt.tmp.packed","rb");
    FILE *rs_gt_of = fopen("r.s.gt.tmp.packed","wb");

    // Write these to values to that this is a well-formed uncompressed 
    // packed int binary file (ubin) file
    fwrite(&num_vars, sizeof(uint32_t), 1, rs_gt_of);
    fwrite(&num_inds, sizeof(uint32_t), 1, rs_gt_of);
     
    /* 
     * we need to loop over the columns in the v-major file.
     * There are num_vars rows, and num_ind_ints 16-ind packed columns
     *
     * In this loop :
     *  i: cols in var-major form, rows in ind-major form
     *  j: rows in var-major form, cols in ind-major form
     */
    uint32_t num_inds_to_write = num_inds;
    for (i = 0; i < num_ind_ints; ++i) { // loop over each int col
        for (j = 0; j < num_vars; ++j) { // loop over head row in that col
            // skip to the value at the row/col
            fseek(s_gt_of, 
                  j*num_ind_ints*sizeof(uint32_t) + //row
                  i*sizeof(uint32_t), //col
                  SEEK_SET);

            fread(&v, sizeof(uint32_t), 1, s_gt_of);

            // one int corresponds to a col of 16 two-bit values
            // two_bit_i will move across the cols
            for (two_bit_i = 0; two_bit_i < 16; ++two_bit_i) {
                I[two_bit_i][I_i] += ((v >> (30 - 2*two_bit_i)) & 3) << 
                                     (30 - 2*I_int_i);
            }
            I_int_i += 1;

            if (I_int_i == 16) {
                I_i += 1;
                I_int_i = 0;
            }
        }

        // When we are at the end of the file, and the number of lines 
        // is not a factor of 16, only write out the lines that contain values
        if (num_inds_to_write >= 16) {
            fwrite(I_data,
                   sizeof(uint32_t),
                   num_var_ints*16,
                   rs_gt_of);
            num_inds_to_write -= 16;
        } else {
            fwrite(I_data,
                   sizeof(uint32_t),
                   num_var_ints*num_inds_to_write,
                   rs_gt_of);
        }
        memset(I_data, 0, num_var_ints*16*sizeof(uint32_t));
        I_int_i = 0;
        I_i = 0;
    }

    fclose(s_gt_of);
    fclose(rs_gt_of);

    free(md_index);
    free(packed_ints);
}
Exemple #12
0
int main_view(int argc, char *argv[])
{
	int i, c, n_files = 0, out_bcf = 0, clevel = -1, multi_flag = 0, excl = 0, not_vcf = 0, in_mem = 0, u_set = 0;
	long seekn = -1, n_rec = LONG_MAX, n_read = 0;
	bgtm_t *bm = 0;
	bcf1_t *b;
	htsFile *out = 0;
	char modew[8], *reg = 0, *site_flt = 0;
	void *bed = 0;
	int n_groups = 0;
	char *gexpr[BGT_MAX_GROUPS], *aexpr = 0, *dbfn = 0, *fmt = 0;
	bgt_file_t **files = 0;
	fmf_t *vardb = 0;

	while ((c = getopt(argc, argv, "ubs:r:l:CMGB:ef:g:a:i:n:SHt:d:")) >= 0) {
		if (c == 'b') out_bcf = 1;
		else if (c == 'r') reg = optarg;
		else if (c == 'l') clevel = atoi(optarg);
		else if (c == 'e') excl = 1;
		else if (c == 'u') u_set = 1;
		else if (c == 'B') bed = bed_read(optarg);
		else if (c == 'C') multi_flag |= BGT_F_SET_AC;
		else if (c == 'G') multi_flag |= BGT_F_NO_GT;
		else if (c == 'S') multi_flag |= BGT_F_NO_GT | BGT_F_CNT_AL, not_vcf = 1;
		else if (c == 'H') multi_flag |= BGT_F_NO_GT | BGT_F_CNT_HAP, not_vcf = 1;
		else if (c == 'M') in_mem = 1;
		else if (c == 'i') seekn = atol(optarg) - 1;
		else if (c == 'n') n_rec = atol(optarg);
		else if (c == 'f') site_flt = optarg;
		else if (c == 't') fmt = optarg, not_vcf = 1;
		else if (c == 'd') dbfn = optarg;
		else if (c == 's' && n_groups < BGT_MAX_GROUPS) gexpr[n_groups++] = optarg;
		else if (c == 'a') aexpr = optarg;
	}
	if (n_rec < 0) {
		fprintf(stderr, "[E::%s] option -n must be at least 0.\n", __func__);
		return 1;
	}
	if (clevel > 9) clevel = 9;
	if (u_set) clevel = 0, out_bcf = 1;
	if (n_groups > 1) multi_flag |= BGT_F_SET_AC;
	if (argc - optind < 1) {
		fprintf(stderr, "Usage: bgt %s [options] <bgt-prefix> [...]", argv[0]);
		fputc('\n', stderr);
		fprintf(stderr, "Options:\n");
		fprintf(stderr, "  Sample selection:\n");
		fprintf(stderr, "    -s EXPR      samples list (,sample1,sample2 or a file or expr; see Notes below) [all]\n");
		fprintf(stderr, "  Site selection:\n");
		fprintf(stderr, "    -r STR       region [all]\n");
		fprintf(stderr, "    -B FILE      extract variants overlapping BED FILE []\n");
		fprintf(stderr, "    -e           exclude variants overlapping BED FILE (effective with -B)\n");
		fprintf(stderr, "    -i INT       process from the INT-th record (1-based) []\n");
		fprintf(stderr, "    -n INT       process at most INT records []\n");
		fprintf(stderr, "    -d FILE      variant annotations in FMF (to work with -a) []\n");
		fprintf(stderr, "    -M           load variant annotations in RAM (only with -d)\n");
		fprintf(stderr, "    -a EXPR      alleles list chr:1basedPos:refLen:seq (,allele1,allele2 or a file or expr) []\n");
		fprintf(stderr, "    -f STR       frequency filters []\n");
		fprintf(stderr, "  VCF output:\n");
		fprintf(stderr, "    -b           BCF output (effective without -S/-H)\n");
		fprintf(stderr, "    -l INT       compression level for BCF [default]\n");
		fprintf(stderr, "    -u           equivalent to -bl0 (overriding -b and -l)\n");
		fprintf(stderr, "    -G           don't output sample genotypes\n");
		fprintf(stderr, "    -C           write AC/AN to the INFO field (auto applied with -f or multipl -s)\n");
		fprintf(stderr, "  Non-VCF output:\n");
		fprintf(stderr, "    -S           show samples with a set of alleles (with -a)\n");
		fprintf(stderr, "    -H           count of haplotypes with a set of alleles (with -a)\n");
		fprintf(stderr, "    -t STR       comma-delimited list of fields to output. Accepted variables:\n");
		fprintf(stderr, "                 AC, AN, AC#, AN#, CHROM, POS, END, REF, ALT (# for a group number)\n");
		fprintf(stderr, "Notes:\n");
		fprintf(stderr, "  For option -s/-a, EXPR can be one of:\n");
		fprintf(stderr, "    1) comma-delimited list following a colon/comma. e.g. -s,NA12878,NA12044\n");
		fprintf(stderr, "    2) space-delimited file with the first column giving a sample/allele name. e.g. -s list.txt\n");
		fprintf(stderr, "    3) expression if .spl/-d file contains metadata. e.g.: -s\"gender=='M'&&population!='CEU'\"\n");
		fprintf(stderr, "  If multiple -s is specified, the AC/AN of the first group will be written to VCF INFO AC1/AN1,\n");
		fprintf(stderr, "  the second to AC2/AN2, etc.\n");
		return 1;
	}

	if (dbfn && in_mem) vardb = fmf_read(dbfn), dbfn = 0;

	if ((multi_flag&(BGT_F_CNT_AL|BGT_F_CNT_HAP)) && aexpr == 0) {
		fprintf(stderr, "[E::%s] -a must be specified when -S/-H is in use.\n", __func__);
		return 1;
	}

	n_files = argc - optind;
	files = (bgt_file_t**)calloc(n_files, sizeof(bgt_file_t*));
	for (i = 0; i < n_files; ++i) {
		files[i] = bgt_open(argv[optind+i]);
		if (files[i] == 0) {
			fprintf(stderr, "[E::%s] failed to open BGT with prefix '%s'\n", __func__, argv[optind+i]);
			return 1; // FIXME: memory leak
		}
	}

	bm = bgtm_reader_init(n_files, files);
	bgtm_set_flag(bm, multi_flag);
	if (site_flt && bgtm_set_flt_site(bm, site_flt) != 0) {
		fprintf(stderr, "[E::%s] failed to set frequency filters. Syntax error?\n", __func__);
		return 1;
	}
	if (reg && bgtm_set_region(bm, reg) < 0) {
		fprintf(stderr, "[E::%s] failed to set region. Region format error?\n", __func__);
		return 1;
	}
	if (bed) bgtm_set_bed(bm, bed, excl);
	if (fmt && bgtm_set_table(bm, fmt) < 0) {
		fprintf(stderr, "[E::%s] failed to set tabular output.\n", __func__);
		return 1;
	}
	if (seekn > 0) bgtm_set_start(bm, seekn);
	if (aexpr) {
		int n_al;
		n_al = bgtm_set_alleles(bm, aexpr, vardb, dbfn);
		if (n_al < 0) {
			fprintf(stderr, "[E::%s] failed to set alleles.\n", __func__);
			return 1;
		} else if (n_al == 0)
			fprintf(stderr, "[W::%s] no alleles selected.\n", __func__);
	}
	for (i = 0; i < n_groups; ++i) {
		if (bgtm_add_group(bm, gexpr[i]) < 0) {
			fprintf(stderr, "[E::%s] failed to add sample group '%s'.\n", __func__, gexpr[i]);
			return 1;
		}
	}
	bgtm_prepare(bm); // bgtm_prepare() generates the VCF header

	if (!not_vcf) {
		strcpy(modew, "w");
		if (out_bcf) strcat(modew, "b");
		sprintf(modew + strlen(modew), "%d", clevel);
		out = hts_open("-", modew, 0);
		vcf_hdr_write(out, bm->h_out);
	}

	b = bcf_init1();
	while (bgtm_read(bm, b) >= 0 && n_read < n_rec) {
		if (out) vcf_write1(out, bm->h_out, b);
		if (fmt && bm->n_fields > 0) puts(bm->tbl_line.s);
		++n_read;
	}
	bcf_destroy1(b);

	if (not_vcf && bm->n_aal > 0) {
		if (bm->flag & BGT_F_CNT_HAP) {
			bgt_hapcnt_t *hc;
			int n_hap;
			char *s;
			hc = bgtm_hapcnt(bm, &n_hap);
			s = bgtm_hapcnt_print_destroy(bm, n_hap, hc);
			fputs(s, stdout);
			free(s);
		}
		if (bm->flag & BGT_F_CNT_AL) {
			char *s;
			if ((s = bgtm_alcnt_print(bm)) != 0)
				fputs(s, stdout);
			free(s);
		}
	}

	if (out) hts_close(out);
	bgtm_reader_destroy(bm);
	if (bed) bed_destroy(bed);
	for (i = 0; i < n_files; ++i) bgt_close(files[i]);
	free(files);
	if (vardb) fmf_destroy(vardb);
	return 0;
}
Exemple #13
0
static int mpileup(mplp_conf_t *conf)
{
    if (conf->nfiles == 0) {
        fprintf(stderr,"[%s] no input file/data given\n", __func__);
        exit(EXIT_FAILURE);
    }

    mplp_ref_t mp_ref = MPLP_REF_INIT;
    conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
    conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
    conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
    conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));

    // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
    // must be kept in the memory for the whole time which can be a problem with many bams.
    // Therefore if none or only one region is requested, we initialize the bam iterator as
    // before and free the index. Only when multiple regions are queried, we keep the index.
    int nregs = 0;
    if ( conf->reg_fname )
    {
        if ( conf->reg_is_file )
        {
            conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
            if ( !conf->reg ) {
                fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
                exit(EXIT_FAILURE);
            }
        }
        else
        {
            conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
            if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
                fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
                exit(EXIT_FAILURE);
            }
        }
        nregs = regidx_nregs(conf->reg);
        conf->reg_itr = regitr_init(conf->reg);
        regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
    }

    // read the header of each file in the list and initialize data
    // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
    bam_hdr_t *hdr = NULL;      // header of first file in input list
    int i;
    for (i = 0; i < conf->nfiles; ++i) {
        bam_hdr_t *h_tmp;
        conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
        conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
        if ( !conf->mplp_data[i]->fp )
        {
            fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
            exit(EXIT_FAILURE);
        }
        if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
            fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
            exit(EXIT_FAILURE);
        }
        if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
            fprintf(stderr, "[%s] failed to process %s: %s\n",
                    __func__, conf->fai_fname, strerror(errno));
            exit(EXIT_FAILURE);
        }
        conf->mplp_data[i]->conf = conf;
        conf->mplp_data[i]->ref = &mp_ref;
        h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
        if ( !h_tmp ) {
            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
            exit(EXIT_FAILURE);
        }
        conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
        conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
        if ( conf->mplp_data[i]->bam_id<0 )
        {
            // no usable readgroups in this bam, it can be skipped
            sam_close(conf->mplp_data[i]->fp);
            free(conf->mplp_data[i]);
            bam_hdr_destroy(h_tmp);
            free(conf->files[i]);
            if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
            conf->nfiles--;
            i--;
            continue;
        }
        if (conf->reg) {
            hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
            if (idx == NULL) {
                fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
                exit(EXIT_FAILURE);
            }
            conf->buf.l = 0;
            ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
            conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
            if ( !conf->mplp_data[i]->iter ) 
            {
                conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                if ( conf->mplp_data[i]->iter ) {
                    fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
                    exit(EXIT_FAILURE);
                }
                fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
                exit(EXIT_FAILURE);
            }
            if ( nregs==1 ) // no need to keep the index in memory
               hts_idx_destroy(idx);
            else
                conf->mplp_data[i]->idx = idx;
        }

        if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
        else {
            // FIXME: check consistency between h and h_tmp
            bam_hdr_destroy(h_tmp);

            // we store only the first file's header; it's (alleged to be)
            // compatible with the i-th file's target_name lookup needs
            conf->mplp_data[i]->h = hdr;
        }
    }
    // allocate data storage proportionate to number of samples being studied sm->n
    bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
    conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
    conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
    conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));  

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
    // write the VCF header
    conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
    if (conf->bcf_fp == NULL) {
        fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
        exit(EXIT_FAILURE);
    }
    if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);

    // BCF header creation
    conf->bcf_hdr = bcf_hdr_init("w");
    conf->buf.l = 0;

    if (conf->record_cmd_line)
    {
        ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);

        conf->buf.l = 0;
        ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
        for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
        kputc('\n', &conf->buf);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }

    if (conf->fai_fname)
    {
        conf->buf.l = 0;
        ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }

    // Translate BAM @SQ tags to BCF ##contig tags
    // todo: use/write new BAM header manipulation routines, fill also UR, M5
    for (i=0; i<hdr->n_targets; i++)
    {
        conf->buf.l = 0;
        ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }
    conf->buf.l = 0;

    bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#if CDF_MWU_TESTS
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
#endif
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
    bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
    if ( conf->fmt_flag&B2B_FMT_DP )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
    if ( conf->fmt_flag&B2B_FMT_DV )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
    if ( conf->fmt_flag&B2B_FMT_DPR )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
    if ( conf->fmt_flag&B2B_INFO_DPR )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
    if ( conf->fmt_flag&B2B_FMT_DP4 )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
    if ( conf->fmt_flag&B2B_FMT_SP )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
    if ( conf->fmt_flag&B2B_FMT_AD )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
    if ( conf->fmt_flag&B2B_FMT_ADF )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
    if ( conf->fmt_flag&B2B_FMT_ADR )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
    if ( conf->fmt_flag&B2B_INFO_AD )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
    if ( conf->fmt_flag&B2B_INFO_ADF )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
    if ( conf->fmt_flag&B2B_INFO_ADR )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
    if ( conf->gvcf )
        gvcf_update_header(conf->gvcf, conf->bcf_hdr);

    int nsmpl;
    const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
    for (i=0; i<nsmpl; i++)
        bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
    bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr);

    conf->bca = bcf_call_init(-1., conf->min_baseQ);
    conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
    conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
    conf->bca->min_frac = conf->min_frac;
    conf->bca->min_support = conf->min_support;
    conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;

    conf->bc.bcf_hdr = conf->bcf_hdr;
    conf->bc.n  = nsmpl;
    conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
    if (conf->fmt_flag)
    {
        assert( sizeof(float)==sizeof(int32_t) );
        conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
        conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
        if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
        {
            // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
            conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
            conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
            for (i=0; i<nsmpl; i++)
            {
                conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
                conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
            }
        }
    }

    // init mpileup
    conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
    if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
    if ( (double)conf->max_depth * conf->nfiles > 1<<20)
        fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
    if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
        fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
    bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
    conf->max_indel_depth = conf->max_indel_depth * nsmpl;
    conf->bcf_rec = bcf_init1();
    bam_mplp_constructor(conf->iter, pileup_constructor);

    // Run mpileup for multiple regions
    if ( nregs )
    {
        int ireg = 0;
        do 
        {
            // first region is already positioned
            if ( ireg++ > 0 )
            {
                conf->buf.l = 0;
                ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end);

                for (i=0; i<conf->nfiles; i++) 
                {
                    hts_itr_destroy(conf->mplp_data[i]->iter);
                    conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
                    if ( !conf->mplp_data[i]->iter ) 
                    {
                        conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                        if ( conf->mplp_data[i]->iter ) {
                            fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
                            exit(EXIT_FAILURE);
                        }
                        fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
                        exit(EXIT_FAILURE);
                    }
                    bam_mplp_reset(conf->iter);
                }
            }
            mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
        }
        while ( regitr_loop(conf->reg_itr) );
    }
    else
        mpileup_reg(conf,0,0);

    flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);

    // clean up
    free(conf->bc.tmp.s);
    bcf_destroy1(conf->bcf_rec);
    if (conf->bcf_fp)
    {
        hts_close(conf->bcf_fp);
        bcf_hdr_destroy(conf->bcf_hdr);
        bcf_call_destroy(conf->bca);
        free(conf->bc.PL);
        free(conf->bc.DP4);
        free(conf->bc.ADR);
        free(conf->bc.ADF);
        free(conf->bc.fmt_arr);
        free(conf->bcr);
    }
    if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
    free(conf->buf.s);
    for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
    free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
    bam_mplp_destroy(conf->iter);
    bam_hdr_destroy(hdr);
    for (i = 0; i < conf->nfiles; ++i) {
        if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
        sam_close(conf->mplp_data[i]->fp);
        if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
        free(conf->mplp_data[i]);
    }
    if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
    free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
    free(mp_ref.ref[0]);
    free(mp_ref.ref[1]);
    return 0;
}
Exemple #14
0
int vcf_index_stats(char *fname, int stats)
{
    char *fn_out = NULL;
    FILE *out;
    out = fn_out ? fopen(fn_out, "w") : stdout;

    const char **seq;
    int i, nseq;
    tbx_t *tbx = NULL;
    hts_idx_t *idx = NULL;

    htsFile *fp = hts_open(fname,"r");
    if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; }
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; }

    if ( hts_get_format(fp)->format==vcf )
    {
        tbx = tbx_index_load(fname);
        if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; }
    }
    else if ( hts_get_format(fp)->format==bcf )
    {
        idx = bcf_index_load(fname);
        if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; }
    }
    else
    {
        fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname);
        return 1;
    }

    seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq);
    uint64_t sum = 0;
    for (i=0; i<nseq; i++)
    {
        uint64_t records, v;
        hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v);
        sum+=records;
        if (stats&2 || !records) continue;
        bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL);
        int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1;
        fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records);
    }
    if (!sum)
    {
        // No counts found.
        // Is this because index version has no stored count data, or no records?
        bcf1_t *rec = bcf_init1();
        if (bcf_read1(fp, hdr, rec) >= 0)
        {
            fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname);
            return 1;
        }
        bcf_destroy1(rec);
    }
    if (stats&2) fprintf(out, "%" PRIu64 "\n", sum);
    free(seq);
    fclose(out);
    hts_close(fp);
    bcf_hdr_destroy(hdr);
    if (tbx)
        tbx_destroy(tbx);
    if (idx)
        hts_idx_destroy(idx);
    return 0;
}
Exemple #15
0
//{{{ void print_query_result_offset(uint32_t *mask,
void print_query_result_offset(uint32_t *mask,
                               uint32_t mask_len,
                               uint32_t *vids,
                               struct gqt_query *q,
                               uint32_t **counts,
                               uint32_t *id_lens,
                               uint32_t *U_R,
                               uint32_t U_R_len,
                               char **id_query_list,
                               char **gt_query_list,
                               uint32_t num_qs,
                               uint32_t num_fields,
                               char *off_file_name,
                               char *source_file,
                               char *full_cmd)
{
    struct off_file *off_f = open_off_file(off_file_name);
    struct bcf_file bcf_f = init_bcf_file(source_file);

    char *sample_names = NULL;

    uint32_t i,j,k,line_idx,bytes, bit_i = 0;
    int r;
    for (i = 0; i < U_R_len; ++i) {
        if (i == 0 )
            r = asprintf(&sample_names,
                         "%s",
                         bcf_f.hdr->samples[U_R[i]]);
        else
            r = asprintf(&sample_names,
                         "%s,%s",
                         sample_names,
                         bcf_f.hdr->samples[U_R[i]]);
        if (r == -1)
            err(EX_OSERR, "asprintf error");
    }

    if (bcf_hdr_set_samples(bcf_f.hdr, sample_names, 0) != 0)
        errx(EX_DATAERR, "Error setting samples: %s\n", source_file);

    char *info_s;
 
    for (i = 0; i < num_qs; i++) {
        if ( q[i].variant_op == p_count ) {
            r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Integer,"
                         "Description=\"GQT count result from "
                         "phenotype:'%s' genotype:'%s'\">",
                         i, id_query_list[i], gt_query_list[i]);
            if (r == -1) err(EX_OSERR, "asprintf error");

            if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
                errx(EX_DATAERR, "Error updating header: %s\n", source_file);

        } else if ( q[i].variant_op == p_pct ) {
            r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float,"
                         "Description=\"GQT percent result from "
                         "phenotype:'%s' genotype:'%s'\">",
                         i, id_query_list[i], gt_query_list[i]);
            if (r == -1) err(EX_OSERR, "asprintf error");

            if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
                errx(EX_DATAERR, "Error updating header: %s\n", source_file);

        } else if ( q[i].variant_op == p_maf ) {
            r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float,"
                         "Description=\"GQT maf result from "
                         "phenotype:'%s' genotype:'%s'\">",
                         i, id_query_list[i], gt_query_list[i]);

            if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
                errx(EX_DATAERR, "Error updating header: %s\n", source_file);
        }

    }

    r = asprintf(&info_s, "##%s_queryVersion=%s", PROGRAM_NAME, VERSION);
    if (r == -1) err(EX_OSERR, "asprintf error");

    if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
        errx(EX_DATAERR, "Error updating header: %s\n", source_file);

    r = asprintf(&info_s, "##%s_queryCommand=%s", PROGRAM_NAME, full_cmd);
    if (r == -1) err(EX_OSERR, "asprintf error");

    if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
        errx(EX_DATAERR, "Error updating header: %s\n", source_file);

    htsFile *out_f = hts_open("-","w");
    if ( !out_f )
        err(EX_DATAERR, "Could open output file");

    bcf_hdr_write(out_f, bcf_f.hdr);

    bcf_f.line = bcf_init1();

    for (i=0; i < mask_len; ++i) {
        bytes = mask[i];
	if (bytes == 0)
            continue; /* skip a bunch of ops if you can */
        for (j=0; j < 32; j++) {
            if (bytes & 1 << (31 - j)) {
	        line_idx = i*32+j;

                r = goto_bcf_line(&bcf_f, off_f, line_idx);

                if (r == -1) 
                    err(EX_NOINPUT,
                        "Error seeking file '%s'", bcf_f.file_name);

                r = get_bcf_line(&bcf_f);
                if (r == -1) 
                    err(EX_NOINPUT,
                        "Error reading file '%s'", bcf_f.file_name);

                for (k=0; k < num_qs; k++) {
                    r = asprintf(&info_s, "GQT_%u", k);
                    if (r == -1)
                        err(EX_OSERR, "asprintf error");

                    if ( q[k].variant_op == p_count ) {
                        int32_t v = counts[k][line_idx];
                        if (bcf_update_info_int32(bcf_f.hdr,
                                                  bcf_f.line,
                                                  info_s,
                                                  &v,
                                                  1) != 0)
                            errx(EX_DATAERR,
                                 "Error adding to info field: %s\n",
                                 bcf_f.file_name);
                    } else if (q[k].variant_op == p_pct) {
                        float v = ((float)counts[k][line_idx])/
                                    ((float) id_lens[k]);
                        if (bcf_update_info_float(bcf_f.hdr,
                                                  bcf_f.line,
                                                  info_s,
                                                  &v,
                                                  1) != 0)
                            errx(EX_DATAERR,
                                 "Error adding to info field: %s\n",
                                 bcf_f.file_name);

                    } else if (q[k].variant_op == p_maf) {
                        float v = ((float)counts[k][line_idx])/
                                    (((float) id_lens[k])*2.0);
                        if (bcf_update_info_float(bcf_f.hdr,
                                                  bcf_f.line,
                                                  info_s,
                                                  &v,
                                                  1) != 0)
                            errx(EX_DATAERR,
                                 "Error adding to info field: %s\n",
                                 bcf_f.file_name);
                    }
                }

                bcf_write(out_f, bcf_f.hdr, bcf_f.line);

            }
	    bit_i++;
	    if (bit_i == num_fields)
	        break;
        }

        if (bit_i == num_fields)
            break;
    }
    hts_close(out_f);
    destroy_off_file(off_f);
}
Exemple #16
0
//{{{ void get_bcf_query_result(uint32_t *mask,
void get_bcf_query_result(uint32_t *mask,
                        uint32_t mask_len,
                        struct gqt_query *q,
                        char **id_query_list,
                        uint32_t *id_lens,
                        uint32_t num_qs,
                        uint32_t num_fields,
                        char *vid_file_name,
                        char *bcf_file_name,
                        int bcf_output)
{

    /* The VID file contains the line numbers of the variants after they have
     * been sorted.  To reach back into the BCF file to print the metadata
     * associated with the variants marked in the mask, we need to create a
     * sorted list of line numbers we want.  So first we intersect the VID file
     * and the mask, then sort it.
     */
    /*
    FILE *vid_f = fopen(vid_file_name, "rb");
    if (!vid_f)
        err(EX_NOINPUT, "Cannot read file\"%s\"", vid_file_name);

    uint32_t *vids = (uint32_t *) malloc(num_fields*sizeof(uint32_t));
    if (!vids )
        err(EX_OSERR, "malloc error");

    size_t fr = fread(vids, sizeof(uint32_t), num_fields, vid_f);
    check_file_read(vid_file_name, vid_f, num_fields, fr);

    fclose(vid_f);
    */
    struct vid_file *vid_f = open_vid_file(vid_file_name);
    load_vid_data(vid_f);

    uint32_t i, j, masked_vid_count = 0;

    for (i = 0; i < mask_len; ++i)
        masked_vid_count += popcount(mask[i]);

    uint32_t *masked_vids = (uint32_t *)
            malloc(masked_vid_count*sizeof(uint32_t));
    if (!masked_vids )
        err(EX_OSERR, "malloc error");
    uint32_t masked_vid_i = 0;

    for (i = 0; i < mask_len; ++i) {
        uint32_t bytes = mask[i];
	if (bytes == 0)
            continue; /* skip a bunch of ops if you can */
        for (j = 0; j < 32; j++) {
            if (bytes & (1 << (31 - j))) {
                masked_vids[masked_vid_i] = vid_f->vids[i*32 + j];
                masked_vid_i+=1;
            }
        }
        if (masked_vid_i == masked_vid_count)
            break;
    }

    destroy_vid_file(vid_f);

    qsort(masked_vids, masked_vid_count, sizeof(uint32_t), compare_uint32_t);

    htsFile *fp    = hts_open(bcf_file_name,"rb");
    bcf_hdr_t *hdr = bcf_hdr_read(fp);
    bcf1_t *line    = bcf_init1();
    //bcf_hdr_set_samples(hdr, print_name_csv, 0);

    htsFile *out;
    if (!bcf_output)
        out = hts_open("-", "w");
    else
        out = hts_open("-", "wb");

    int r = bcf_hdr_write(out, hdr);

    uint32_t bcf_line_i = 0;
    masked_vid_i = 0;
    while ( bcf_read(fp, hdr, line) != -1) {
        if (masked_vids[masked_vid_i] == bcf_line_i) {
            r = bcf_unpack(line, BCF_UN_ALL);
            r = bcf_write1(out, hdr, line);
            masked_vid_i+=1;
        }
        if (masked_vid_i == masked_vid_count)
            break;
        bcf_line_i += 1;
    }

    hts_close(out);
    hts_close(fp);
}
Exemple #17
0
int main_vcfview(int argc, char *argv[])
{
	int i, c, clevel = -1, flag = 0, n_samples = -1, *imap = 0, excl_snp = 0, excl_indel = 0;
	char *fn_ref = 0, *fn_out = 0, moder[8], **samples = 0;
	bcf_hdr_t *h, *hsub = 0;
	htsFile *in;
	bcf1_t *b;

	while ((c = getopt(argc, argv, "l:bSt:o:T:s:GNI")) >= 0) {
		switch (c) {
		case 'l': clevel = atoi(optarg); flag |= 2; break;
		case 'S': flag |= 1; break;
		case 'b': flag |= 2; break;
		case 'G': n_samples = 0; break;
		case 't': fn_ref = optarg; flag |= 1; break;
		case 'o': fn_out = optarg; break;
		case 's': samples = hts_readlines(optarg, &n_samples); break;
		case 'N': excl_snp = 1; break;
		case 'I': excl_indel = 1; break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "\nUsage:   vcfview [options] <in.bcf>|<in.vcf>|<in.vcf.gz>\n\n");
		fprintf(stderr, "Options: -b           output in BCF\n");
		fprintf(stderr, "         -S           input is VCF\n");
		fprintf(stderr, "         -o FILE      output file name [stdout]\n");
		fprintf(stderr, "         -l INT       compression level [%d]\n", clevel);
		fprintf(stderr, "         -t FILE      list of reference names and lengths [null]\n");
		fprintf(stderr, "         -s FILE/STR  list of samples (STR if started with ':'; FILE otherwise) [null]\n");
		fprintf(stderr, "         -G           drop individual genotype information\n");
		fprintf(stderr, "         -N           exclude SNPs\n");
		fprintf(stderr, "         -I           exclude INDELs\n");
		fprintf(stderr, "\n");
		return 1;
	}
	strcpy(moder, "r");
	if ((flag&1) == 0 && !(file_type(argv[optind])&(IS_VCF|IS_VCF_GZ))) strcat(moder, "b");

	in = hts_open(argv[optind], moder, fn_ref);
	h = vcf_hdr_read(in);
	if (h == 0) {
		fprintf(stderr, "[E::%s] fail to read the VCF/BCF2 header\n", __func__);
		hts_close(in);
		return 1;
	}
	if (n_samples >= 0) {
		if (n_samples) imap = (int*)malloc(n_samples * sizeof(int));
		hsub = bcf_hdr_subset(h, n_samples, samples, imap);
	}
	b = bcf_init1();

	if ((flag&4) == 0) { // VCF/BCF output
		htsFile *out;
		char modew[8];
		strcpy(modew, "w");
		if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
		if (flag&2) strcat(modew, "b");
		out = hts_open(fn_out? fn_out : "-", modew, 0);
		vcf_hdr_write(out, hsub? hsub : h);
		if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
			hts_idx_t *idx;
			if ((idx = bcf_index_load(argv[optind])) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BCF index\n", __func__);
				return 1;
			}
			for (i = optind + 1; i < argc; ++i) {
				hts_itr_t *iter;
				if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) {
					fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
					continue;
				}
				while (bcf_itr_next((BGZF*)in->fp, iter, b) >= 0) {
					if (excl_snp && bcf_is_snp(b)) continue;
					if (excl_indel && !bcf_is_snp(b)) continue;
					if (n_samples >= 0) {
						bcf_subset(h, b, n_samples, imap);
						vcf_write1(out, hsub, b);
					} else vcf_write1(out, h, b);
				}
				hts_itr_destroy(iter);
			}
			hts_idx_destroy(idx);
		} else {
			while (vcf_read1(in, h, b) >= 0) {
				if (excl_snp && bcf_is_snp(b)) continue;
				if (excl_indel && !bcf_is_snp(b)) continue;
				if (n_samples >= 0) {
					bcf_subset(h, b, n_samples, imap);
					vcf_write1(out, hsub, b);
				} else vcf_write1(out, h, b);
			}
		}
		hts_close(out);
	}

	bcf_destroy1(b);
	if (n_samples > 0) {
		for (i = 0; i < n_samples; ++i) free(samples[i]);
		free(samples);
		bcf_hdr_destroy(hsub);
		free(imap);
	}
	bcf_hdr_destroy(h);
	hts_close(in);
	return 0;
}
Exemple #18
0
int bcf_sr_next_line(readers_t *files)
{
    int32_t min_pos = INT_MAX;
    int ret,i,j;
    kstring_t *str = &files->tmps;

    while ( min_pos==INT_MAX )
    {
        // Need to open new chromosome?
        int eos = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !files->readers[i].itr && !files->readers[i].nbuffer ) eos++;
        if ( eos==files->nreaders )
        {
            const char *seq;
            if ( files->targets )
            {
                seq = tgt_next_seq(files->targets);
                if ( !seq ) return 0;   // all chroms scanned
            }
            else
            {
                if ( ++files->iseq >= files->nseqs ) return 0;  // all chroms scanned
                seq = files->seqs[files->iseq];
            }
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                if ( reader->tbx )
                    reader->itr = tbx_itr_querys(reader->tbx,seq);
                else
                    reader->itr = bcf_itr_querys(reader->bcf,reader->header,seq);
            }
        }

        // Find the smallest coordinate
        for (i=0; i<files->nreaders; i++)
        {
            reader_t *reader = &files->readers[i];
            int buffer_full = ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) ? 1 : 0;
            if ( reader->itr && !buffer_full )
            {
                // Fill the buffer with records starting at the same position
                while (1)
                {
                    if ( reader->nbuffer+1 >= reader->mbuffer ) 
                    {
                        reader->mbuffer += 8;
                        reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
                        for (j=8; j>0; j--)
                            reader->buffer[reader->mbuffer-j] = bcf_init1();
                    }
                    if ( reader->tbx )
                    {
                        ret = tbx_itr_next((BGZF*)reader->file->fp, reader->tbx, reader->itr, str);
                        if ( ret<0 ) break;
                        vcf_parse1(str, reader->header, reader->buffer[reader->nbuffer+1]);
                    }
                    else
                    {
                        ret = bcf_itr_next((BGZF*)reader->file->fp, reader->itr, reader->buffer[reader->nbuffer+1]);
                        if ( ret<0 ) break;
                    }
                    bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
                    // apply filter
                    if ( reader->filter_id!=-1 && reader->buffer[reader->nbuffer+1]->d.n_flt && reader->filter_id!=reader->buffer[reader->nbuffer+1]->d.flt[0] ) continue;
                    set_variant_types(reader->buffer[reader->nbuffer+1]);
                    reader->nbuffer++;
                    if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;
                }
                if ( ret<0 ) { tbx_itr_destroy(reader->itr); reader->itr = NULL; } // done for this chromosome
            }
            if ( reader->nbuffer )
            {
                if ( min_pos > reader->buffer[1]->pos ) min_pos = reader->buffer[1]->pos; 
            }
            // The buffer is full - either there is nothing else to read or the last record has a different coordinate
            if ( files->collapse && reader->nbuffer>2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
            {
                collapse_buffer(files, reader);
            }
        }
        if ( files->targets && min_pos!=INT_MAX )
        {
            int ret = tgt_has_position(files->targets, min_pos);
            if ( ret==1 ) continue;

            // The position must be skipped
            if ( ret==-1 )
            {
                // done for this chromosome, don't read the rest
                for (i=0; i<files->nreaders; i++) 
                {
                    files->readers[i].nbuffer = 0;
                    if ( files->readers[i].itr )
                    {
                        tbx_itr_destroy(files->readers[i].itr);
                        files->readers[i].itr = NULL;
                    }
                }
                min_pos = INT_MAX;
                continue;
            }

            // remove the active line, save the buffer line
            for (i=0; i<files->nreaders; i++)
            {
                reader_t *reader = &files->readers[i];
                for (j=1; j<=reader->nbuffer; j++)
                    if ( reader->buffer[j]->pos!=min_pos ) break;
                if ( j==1 ) continue;
                if ( j<=reader->nbuffer )
                {
                    bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[j]; reader->buffer[j] = tmp;
                    reader->nbuffer = 1;
                }
                else 
                    reader->nbuffer = 0;
            }
            min_pos = INT_MAX;
        }
    }

    //printf("[next_line] min_pos=%d\n", min_pos+1);
    //debug_buffers(files);

    // Set the current line
    ret = 0;
    bcf1_t *first = NULL;
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( !reader->nbuffer || reader->buffer[1]->pos!=min_pos ) continue;

        // Match the records by REF and ALT
        int j, irec = -1;
        if ( first )
        {
            for (j=1; j<=reader->nbuffer; j++)
            {
                bcf1_t *line = reader->buffer[j];
                if ( min_pos != line->pos ) break;  // done with this buffer

                if ( files->collapse&COLLAPSE_ANY ) { irec=j; break; }  // checking position only
                if ( files->collapse&COLLAPSE_SNPS && first->d.var_type&VCF_SNP && line->d.var_type&VCF_SNP ) { irec=j; break; }
                if ( files->collapse&COLLAPSE_INDELS && first->d.var_type&VCF_INDEL && line->d.var_type&VCF_INDEL ) { irec=j; break; }

                if ( first->rlen != line->rlen ) continue;  // REFs do not match
                if ( strcmp(first->d.allele[0], line->d.allele[0]) ) continue; // REFs do not match
                int ial,jal;
                if ( files->collapse==COLLAPSE_NONE )
                {
                    // require exact match, all alleles must be identical
                    if ( first->n_allele!=line->n_allele ) continue;   // different number of alleles
                    int nmatch = 1; // REF has been already checked
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { nmatch++; break; }
                    }
                    if ( nmatch>=first->n_allele ) { irec=j; break; }
                }
                else
                {
                    // thorough check: the REFs and some of the alleles have to be shared
                    // (neglecting different representations of the same indel for now)
                    for (ial=1; ial<first->n_allele; ial++)
                    {
                        for (jal=1; jal<line->n_allele; jal++)
                            if ( !strcmp(first->d.allele[ial], line->d.allele[jal]) ) { irec=j; break; }
                        if ( irec>=1 ) break;
                    }
                }
                if ( irec>=1 ) break;
            }
            if ( irec==-1 ) continue;
        }
        else 
        {
            first = reader->buffer[1];
            irec  = 1;
        }
        bcf1_t *tmp = reader->buffer[0];
        reader->buffer[0] = reader->buffer[irec];
        for (j=irec+1; j<=reader->nbuffer; j++)
            reader->buffer[j-1] = reader->buffer[j];
        reader->buffer[ reader->nbuffer ] = tmp;
        reader->nbuffer--;
        ret |= 1<<i;
    }
    // fprintf(stdout,"[next_line] min_pos=%d mask=%d\n", min_pos+1, ret);
    // debug_buffers(stdout,files);

    return ret;
}
Exemple #19
0
int main(int argc, char **argv)
{
    char *fname = argc>1 ? argv[1] : "/dev/null";
    htsFile *fp = hts_open(fname, "w");
    bcf_hdr_t *hdr1, *hdr2;

    hdr1 = bcf_hdr_init("w");
    hdr2 = bcf_hdr_init("w");

    // Add two shared and two private annotations
    bcf_hdr_append(hdr1, "##contig=<ID=1>");
    bcf_hdr_append(hdr1, "##contig=<ID=2>");
    bcf_hdr_append(hdr2, "##contig=<ID=2>");
    bcf_hdr_append(hdr2, "##contig=<ID=1>");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT1,Description=\"Filter 1\">");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT2,Description=\"Filter 2\">");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT3,Description=\"Filter 3\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT4,Description=\"Filter 4\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT3,Description=\"Filter 3\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT2,Description=\"Filter 2\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF1,Number=.,Type=Integer,Description=\"Info 1\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF4,Number=.,Type=Integer,Description=\"Info 4\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT1,Number=.,Type=Integer,Description=\"FMT 1\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT4,Number=.,Type=Integer,Description=\"FMT 4\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">");
    bcf_hdr_add_sample(hdr1,"SMPL1");
    bcf_hdr_add_sample(hdr1,"SMPL2");
    bcf_hdr_add_sample(hdr2,"SMPL1");
    bcf_hdr_add_sample(hdr2,"SMPL2");
    bcf_hdr_sync(hdr1);
    bcf_hdr_sync(hdr2);

    hdr2 = bcf_hdr_merge(hdr2,hdr1);
    bcf_hdr_sync(hdr2);
    if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname);

    bcf1_t *rec = bcf_init1();
    rec->rid = bcf_hdr_name2id(hdr1, "1");
    rec->pos = 0;
    bcf_update_alleles_str(hdr1, rec, "G,A");
    int32_t tmpi[3];
    tmpi[0] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT1");
    tmpi[1] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2");
    tmpi[2] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT3");
    bcf_update_filter(hdr1, rec, tmpi, 3);
    tmpi[0] = 1; bcf_update_info_int32(hdr1, rec, "INF1", tmpi, 1);
    tmpi[0] = 2; bcf_update_info_int32(hdr1, rec, "INF2", tmpi, 1);
    tmpi[0] = 3; bcf_update_info_int32(hdr1, rec, "INF3", tmpi, 1);
    tmpi[0] = tmpi[1] = 1; bcf_update_format_int32(hdr1, rec, "FMT1", tmpi, 2);
    tmpi[0] = tmpi[1] = 2; bcf_update_format_int32(hdr1, rec, "FMT2", tmpi, 2);
    tmpi[0] = tmpi[1] = 3; bcf_update_format_int32(hdr1, rec, "FMT3", tmpi, 2);

    bcf_remove_filter(hdr1, rec, bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"), 0);
    bcf_update_info_int32(hdr1, rec, "INF2", NULL, 0);
    bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0);

    bcf_translate(hdr2, hdr1, rec);
    if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname);

    // Clean
    bcf_destroy1(rec);
    bcf_hdr_destroy(hdr1);
    bcf_hdr_destroy(hdr2);
    int ret;
    if ( (ret=hts_close(fp)) )
    {
        fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
        exit(ret);
    }
    return 0;
}
Exemple #20
0
static void buffered_filters(args_t *args, bcf1_t *line)
{
    /**
     *  The logic of SnpGap=3. The SNPs at positions 1 and 7 are filtered,
     *  positions 0 and 8 are not:
     *           0123456789
     *      ref  .G.GT..G..
     *      del  .A.G-..A..
     *  Here the positions 1 and 6 are filtered, 0 and 7 are not:
     *           0123-456789
     *      ref  .G.G-..G..
     *      ins  .A.GT..A..
     *
     *  The logic of IndelGap=2. The second indel is filtered:
     *           012345678901
     *      ref  .GT.GT..GT..
     *      del  .G-.G-..G-..
     *  And similarly here, the second is filtered:
     *           01 23 456 78
     *      ref  .A-.A-..A-..
     *      ins  .AT.AT..AT..
     */

    // To avoid additional data structure, we abuse bcf1_t's var and var_type records.
    const int SnpGap_set     = VCF_OTHER<<1;
    const int IndelGap_set   = VCF_OTHER<<2;
    const int IndelGap_flush = VCF_OTHER<<3;

    int var_type = 0, i;
    if ( line )
    {
        // Still on the same chromosome?
        int ilast = rbuf_last(&args->rbuf);
        if ( ilast>=0 && line->rid != args->rbuf_lines[ilast]->rid )
            flush_buffer(args, args->rbuf.n); // new chromosome, flush everything

        rbuf_expand0(&args->rbuf,bcf1_t*,args->rbuf.n,args->rbuf_lines);

        // Insert the new record in the buffer. The line would be overwritten in
        // the next bcf_sr_next_line call, therefore we need to swap it with an
        // unused one
        ilast = rbuf_append(&args->rbuf);
        if ( !args->rbuf_lines[ilast] ) args->rbuf_lines[ilast] = bcf_init1();
        SWAP(bcf1_t*, args->files->readers[0].buffer[0], args->rbuf_lines[ilast]);

        var_type = bcf_get_variant_types(line);

        // Find out the size of an indel. The indel boundaries are based on REF
        // (POS+1,POS+rlen-1). This is not entirely correct: mpileup likes to
        // output REF=CAGAGAGAGA, ALT=CAGAGAGAGAGA where REF=C,ALT=CGA could be
        // used. This filter is therefore more strict and may remove some valid
        // SNPs.
        int len = 1;
        if ( var_type & VCF_INDEL )
        {
            for (i=1; i<line->n_allele; i++)
                if ( len < 1-line->d.var[i].n ) len = 1-line->d.var[i].n;
        }

        // Set the REF allele's length to max deletion length or to 1 if a SNP or an insertion.
        line->d.var[0].n = len;
    }

    int k_flush = 1;
    if ( args->indel_gap )
    {
        k_flush = 0;
        // Find indels which are too close to each other
        int last_to = -1;
        for (i=-1; rbuf_next(&args->rbuf,&i); )
        {
            bcf1_t *rec  = args->rbuf_lines[i];
            int rec_from = rec->pos;
            if ( last_to!=-1 && last_to < rec_from ) break;

            k_flush++;
            if ( !(rec->d.var_type & VCF_INDEL) ) continue;

            rec->d.var_type |= IndelGap_set;
            last_to = args->indel_gap + rec->pos + rec->d.var[0].n - 1;
        }
        if ( i==args->rbuf.f && line && last_to!=-1 ) k_flush = 0;
        if ( k_flush || !line )
        {
            // Select the best indel from the cluster of k_flush indels
            int k = 0, max_ac = -1, imax_ac = -1;
            for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
            {
                k++;
                bcf1_t *rec  = args->rbuf_lines[i];
                if ( !(rec->d.var_type & IndelGap_set) ) continue;
                hts_expand(int, rec->n_allele, args->ntmpi, args->tmpi);
                int ret = bcf_calc_ac(args->hdr, rec, args->tmpi, BCF_UN_ALL);
                if ( imax_ac==-1 || (ret && max_ac < args->tmpi[1]) ) { max_ac = args->tmpi[1]; imax_ac = i; }
            }

            // Filter all but the best indel (with max AF or first if AF not available)
            k = 0;
            for (i=-1; rbuf_next(&args->rbuf,&i) && k<k_flush; )
            {
                k++;
                bcf1_t *rec = args->rbuf_lines[i];
                if ( !(rec->d.var_type & IndelGap_set) ) continue;
                rec->d.var_type |= IndelGap_flush;
                if ( i!=imax_ac ) bcf_add_filter(args->hdr, args->rbuf_lines[i], args->IndelGap_id);
            }
        }
Exemple #21
0
int main(int argc, char **argv)
{
    int i, n;
    static struct option const long_opts[] =
    {
	{"out", required_argument, NULL, 1},
	{"report", required_argument, NULL, 2},
	{"dotasref", no_argument, NULL, 3},
	{"help", no_argument, NULL, 0},
	{"version", no_argument, NULL, 4},
	{"export_uncov", no_argument, NULL, 5}
    };
    bool help = FALSE;
    bool report_version = FALSE;
    while ((n = getopt_long(argc, argv, "1:2:304", long_opts, NULL)) >= 0)
    {
	switch (n)
	{
	case 1 : outfile = strdup(optarg); break;
	case 2 : report = strdup(optarg); break;
	case 3 : dotasref = TRUE; break;
	case 0 : help = TRUE; break;
	case 4 : report_version = TRUE; break;
	case 5 : export_uncover = TRUE; break;
	default : return 1;
	}
	if ( help ) return usage();
	if ( report_version ) return show_version();
    }
    n = argc - optind;
    if ( n > 1 ) errabort("only accept one input vcf");
    if ( export_uncover == TRUE && outfile == FALSE) {
	warnings("export uncove region only used with option --out");
	export_uncover = FALSE;
    }
    char * input;
    if ( n == 0 ) input = strdup("-");
    else input = strdup(argv[optind]);
    htsFile * fp = read_vcf_file(input);
    enum htsExactFormat fmt = hts_get_format(fp)->format;
    if ( fmt != vcf && fmt != bcf ) errabort("This is not a VCF/BCF file : %s", input);
    bcf_hdr_t * hdr = bcf_hdr_read(fp);
    int n_samples = bcf_hdr_nsamples(hdr);
    if ( n_samples != 2 ) errabort("the input VCF/BCF file must contain only two samples! %d", n_samples);
    LOG("Using sample %s as ref ...", hdr->samples[0]);
    LOG("Using sample %s as test ...", hdr->samples[1]);
    uint32_t matrix[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} };
    bcf1_t * v = bcf_init1();
    kstring_t str = { 0, 0, 0 };
    uint32_t line = 0;
    htsFile *out = NULL;
    if ( outfile && !check_filename(outfile) ) out = hts_open(outfile, mode);
    if ( out != NULL ) bcf_hdr_write(out, hdr);
    while ( bcf_read1(fp, hdr, v) >= 0 )
    {
	bcf_unpack(v, BCF_UN_STR|BCF_UN_FMT);
	int k;
	str.l = 0;
	int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT");
	if ( !bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, tag_id) ) warnings("There is no 'GT' in the header!");
	for ( i = 0; i < v->n_fmt; ++i )
	    if ( v->d.fmt[i].id == tag_id ) break;
	if ( i == v->n_fmt ) {
	    vcf_format1(hdr, v, &str);
	    LOG("There is no tag GT in this line : %s", str.s);
	    continue;
	}
	corr_t xy[2] = { {-1, -2, -2}, {-1, -2, -2} };
	bcf_fmt_t * fmt = &v->d.fmt[i];

	for ( i = 0; i < 2; ++i )
	{
	    int corr = i;
	    if ( fmt == NULL ) {
		if ( dotasref == TRUE ) xy[corr].alt = ALT_IS_REF;
		else xy[corr].alt = ALT_IS_UNC;
		continue;
	    }
	    int last = -2;
	    uint8_t *d = (uint8_t*)((char*)fmt->p + fmt->size*i);
	    for ( k = 0; k < fmt->n && d[k] != (uint8_t)bcf_int8_vector_end; ++k )
	    {
		int curr = d[k]>>1;
		if ( last != curr ) {
		    if ( curr ) {
			if ( last == -2 ) xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF;
			else xy[corr].alt =  ALT_IS_HET;
		    } else {
			xy[corr].alt =  dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC;
		    }
		} else {
		    if ( curr ) {
			xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF;
		    } else {
			xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC;
		    }
		}
		if (last == -2 ) {
		    xy[corr].min = xy[corr].max = curr;
		} else {
		    if ( curr < xy[corr].min ) xy[corr].min = curr;
		    else if ( curr > xy[corr].max ) xy[corr].max = curr;
		}
		last = curr;
	    }
	}
	matrix[xy[0].alt][xy[1].alt]++;
	if ( xy[0].alt != xy[1].alt && out != NULL) {
	    if ( xy[0].alt == ALT_IS_UNC || xy[1].alt == ALT_IS_UNC ) {
		if ( export_uncover == TRUE ) {
		    str.l = 0;
		    vcf_format1(hdr, v, &str);
		    vcf_write(out, hdr, v);
		}
	    } else {
		str.l = 0;
		vcf_format1(hdr, v, &str);
		vcf_write(out, hdr, v);
	    }
	}
	if ( xy[0].alt == ALT_IS_HET && xy[1].alt == ALT_IS_HET && (xy[0].min != xy[1].min || xy[0].max != xy[1].max ) ) {
	    bias++;
	    matrix[ALT_IS_HET][ALT_IS_HET]--;
	    if ( out != NULL ) {
		str.l = 0;
		vcf_format1(hdr, v, &str);
		vcf_write(out, hdr, v);
	    }
	}
	line++;
    }
    if ( out ) hts_close(out);
    if ( str.m ) free(str.s);
    write_report(matrix, hdr);
    bcf_hdr_destroy(hdr);
    free(input);
    bcf_destroy1(v);
    if ( outfile ) free(outfile);
    if ( report ) free(report);
    if ( hts_close(fp) ) warnings("hts_close returned non-zero status: %s", input);
    return 0;
}
/*
 *  _reader_fill_buffer() - buffers all records with the same coordinate
 */
static void _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
    // Return if the buffer is full: the coordinate of the last buffered record differs
    if ( reader->nbuffer && reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) return;

    // No iterator (sequence not present in this file) and not streaming
    if ( !reader->itr && !files->streaming ) return;

    // Fill the buffer with records starting at the same position
    int i, ret = 0;
    while (1)
    {
        if ( reader->nbuffer+1 >= reader->mbuffer )
        {
            // Increase buffer size
            reader->mbuffer += 8;
            reader->buffer = (bcf1_t**) realloc(reader->buffer, sizeof(bcf1_t*)*reader->mbuffer);
            for (i=8; i>0; i--)     // initialize
            {
                reader->buffer[reader->mbuffer-i] = bcf_init1();
                reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
                reader->buffer[reader->mbuffer-i]->pos = -1;    // for rare cases when VCF starts from 1
            }
        }
        if ( files->streaming )
        {
            if ( reader->file->format.format==vcf )
            {
                if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break;   // no more lines
                int ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
                if ( ret<0 ) break;
            }
            else if ( reader->file->format.format==bcf )
            {
                if ( (ret=bcf_read1(reader->file, reader->header, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            }
            else
            {
                fprintf(stderr,"[%s:%d %s] fixme: not ready for this\n", __FILE__,__LINE__,__FUNCTION__);
                exit(1);
            }
        }
        else if ( reader->tbx_idx )
        {
            if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break;  // no more lines
            vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]);
        }
        else
        {
            if ( (ret=bcf_itr_next(reader->file, reader->itr, reader->buffer[reader->nbuffer+1])) < 0 ) break; // no more lines
            bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
        }

        // apply filter
        if ( !reader->nfilter_ids )
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR);
        else
        {
            bcf_unpack(reader->buffer[reader->nbuffer+1], BCF_UN_STR|BCF_UN_FLT);
            if ( !has_filter(reader, reader->buffer[reader->nbuffer+1]) ) continue;
        }
        reader->nbuffer++;

        if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break;    // the buffer is full
    }
    if ( ret<0 )
    {
        // done for this region
        tbx_itr_destroy(reader->itr);
        reader->itr = NULL;
    }
    if ( files->collapse && reader->nbuffer>=2 && reader->buffer[1]->pos==reader->buffer[2]->pos )
        collapse_buffer(files, reader);
}
Exemple #23
0
/*
 * Performs pileup
 * @param conf configuration for this pileup
 * @param n number of files specified in fn
 * @param fn filenames
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
    extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
    extern void bcf_call_del_rghash(void *rghash);
    mplp_aux_t **data;
    int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth;
    const bam_pileup1_t **plp;
    bam_mplp_t iter;
    bam_hdr_t *h = NULL; /* header of first file in input list */
    char *ref;
    void *rghash = NULL;
    FILE *pileup_fp = NULL;

    bcf_callaux_t *bca = NULL;
    bcf_callret1_t *bcr = NULL;
    bcf_call_t bc;
    htsFile *bcf_fp = NULL;
    bcf_hdr_t *bcf_hdr = NULL;

    bam_sample_t *sm = NULL;
    kstring_t buf;
    mplp_pileup_t gplp;

    memset(&gplp, 0, sizeof(mplp_pileup_t));
    memset(&buf, 0, sizeof(kstring_t));
    memset(&bc, 0, sizeof(bcf_call_t));
    data = calloc(n, sizeof(mplp_aux_t*));
    plp = calloc(n, sizeof(bam_pileup1_t*));
    n_plp = calloc(n, sizeof(int));
    sm = bam_smpl_init();

    if (n == 0) {
        fprintf(stderr,"[%s] no input file/data given\n", __func__);
        exit(1);
    }

    // read the header of each file in the list and initialize data
    for (i = 0; i < n; ++i) {
        bam_hdr_t *h_tmp;
        data[i] = calloc(1, sizeof(mplp_aux_t));
        data[i]->fp = sam_open(fn[i], "rb");
        if ( !data[i]->fp )
        {
            fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
            exit(1);
        }
        hts_set_fai_filename(data[i]->fp, conf->fai_fname);
        data[i]->conf = conf;
        h_tmp = sam_hdr_read(data[i]->fp);
        if ( !h_tmp ) {
            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
            exit(1);
        }
        data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
        bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
        // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
        rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
        if (conf->reg) {
            hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
            if (idx == 0) {
                fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
                exit(1);
            }
            if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) {
                fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg);
                exit(1);
            }
            if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
            hts_idx_destroy(idx);
        }
        if (i == 0) h = h_tmp; /* save the header of first file in list */
        else {
            // FIXME: to check consistency
            bam_hdr_destroy(h_tmp);
        }
    }
    // allocate data storage proportionate to number of samples being studied sm->n
    gplp.n = sm->n;
    gplp.n_plp = calloc(sm->n, sizeof(int));
    gplp.m_plp = calloc(sm->n, sizeof(int));
    gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*));

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);
    // write the VCF header
    if (conf->flag & MPLP_BCF)
    {
        const char *mode;
        if ( conf->flag & MPLP_VCF )
            mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz";   // uncompressed VCF or compressed VCF
        else
            mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb";  // uncompressed BCF or compressed BCF

        bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
        if (bcf_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
            exit(1);
        }

        bcf_hdr = bcf_hdr_init("w");
        kstring_t str = {0,0,0};

        ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version());
        bcf_hdr_append(bcf_hdr, str.s);

        str.l = 0;
        ksprintf(&str, "##samtoolsCommand=samtools mpileup");
        for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]);
        kputc('\n', &str);
        bcf_hdr_append(bcf_hdr, str.s);

        if (conf->fai_fname)
        {
            str.l = 0;
            ksprintf(&str, "##reference=file://%s\n", conf->fai_fname);
            bcf_hdr_append(bcf_hdr, str.s);
        }

        // todo: use/write new BAM header manipulation routines, fill also UR, M5
        for (i=0; i<h->n_targets; i++)
        {
            str.l = 0;
            ksprintf(&str, "##contig=<ID=%s,length=%d>", h->target_name[i], h->target_len[i]);
            bcf_hdr_append(bcf_hdr, str.s);
        }
        free(str.s);
        bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#if CDF_MWU_TESTS
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
#endif
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
        bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
        if ( conf->fmt_flag&B2B_FMT_DP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
        if ( conf->fmt_flag&B2B_FMT_DV )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
        if ( conf->fmt_flag&B2B_FMT_DPR )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_INFO_DPR )
            bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_FMT_DP4 )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
        if ( conf->fmt_flag&B2B_FMT_SP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");

        for (i=0; i<sm->n; i++)
            bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
        bcf_hdr_add_sample(bcf_hdr, NULL);
        bcf_hdr_write(bcf_fp, bcf_hdr);

        bca = bcf_call_init(-1., conf->min_baseQ);
        bcr = calloc(sm->n, sizeof(bcf_callret1_t));
        bca->rghash = rghash;
        bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
        bca->min_frac = conf->min_frac;
        bca->min_support = conf->min_support;
        bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;

        bc.bcf_hdr = bcf_hdr;
        bc.n = sm->n;
        bc.PL = malloc(15 * sm->n * sizeof(*bc.PL));
        if (conf->fmt_flag)
        {
            assert( sizeof(float)==sizeof(int32_t) );
            bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4);
            bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields
            if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) )
            {
                // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
                bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
                for (i=0; i<sm->n; i++)
                    bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES;
            }
        }
    }
    else {
        pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout;

        if (pileup_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
            exit(1);
        }
    }

    if (tid0 >= 0 && conf->fai) { // region is set
        ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
        ref_tid = tid0;
        for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
    } else ref_tid = -1, ref = 0;

    // begin pileup
    iter = bam_mplp_init(n, mplp_func, (void**)data);
    if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
    max_depth = conf->max_depth;
    if (max_depth * sm->n > 1<<20)
        fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
    if (max_depth * sm->n < 8000) {
        max_depth = 8000 / sm->n;
        fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
    }
    max_indel_depth = conf->max_indel_depth * sm->n;
    bam_mplp_set_maxcnt(iter, max_depth);
    bcf1_t *bcf_rec = bcf_init1();
    int ret;
    while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
        if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
        if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
        if (tid != ref_tid) {
            free(ref); ref = 0;
            if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
            for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
            ref_tid = tid;
        }
        if (conf->flag & MPLP_BCF) {
            int total_depth, _ref0, ref16;
            for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
            group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
            _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
            ref16 = seq_nt16_table[_ref0];
            bcf_callaux_clean(bca, &bc);
            for (i = 0; i < gplp.n; ++i)
                bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
            bc.tid = tid; bc.pos = pos;
            bcf_call_combine(gplp.n, bcr, bca, ref16, &bc);
            bcf_clear1(bcf_rec);
            bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0);
            bcf_write1(bcf_fp, bcf_hdr, bcf_rec);
            // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
            if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
            {
                bcf_callaux_clean(bca, &bc);
                for (i = 0; i < gplp.n; ++i)
                    bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) {
                    bcf_clear1(bcf_rec);
                    bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref);
                    bcf_write1(bcf_fp, bcf_hdr, bcf_rec);
                }
            }
        } else {
            fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
            for (i = 0; i < n; ++i) {
                int j, cnt;
                for (j = cnt = 0; j < n_plp[i]; ++j) {
                    const bam_pileup1_t *p = plp[i] + j;
                    if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
                }
                fprintf(pileup_fp, "\t%d\t", cnt);
                if (n_plp[i] == 0) {
                    fputs("*\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
                } else {
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ)
                            pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
                    }
                    putc('\t', pileup_fp);
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        int c = bam_get_qual(p->b)[p->qpos];
                        if (c >= conf->min_baseQ) {
                            c = c + 33 < 126? c + 33 : 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_MAPQ) {
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            const bam_pileup1_t *p = plp[i] + j;
                            int c = bam_get_qual(p->b)[p->qpos];
                            if ( c < conf->min_baseQ ) continue;
                            c = plp[i][j].b->core.qual + 33;
                            if (c > 126) c = 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_POS) {
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            if (j > 0) putc(',', pileup_fp);
                            fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow...
                        }
                    }
                }
            }
            putc('\n', pileup_fp);
        }
    }

    // clean up
    free(bc.tmp.s);
    bcf_destroy1(bcf_rec);
    if (bcf_fp)
    {
        hts_close(bcf_fp);
        bcf_hdr_destroy(bcf_hdr);
        bcf_call_destroy(bca);
        free(bc.PL);
        free(bc.DP4);
        free(bc.DPR);
        free(bc.fmt_arr);
        free(bcr);
    }
    if (pileup_fp && conf->output_fname) fclose(pileup_fp);
    bam_smpl_destroy(sm); free(buf.s);
    for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
    free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
    bcf_call_del_rghash(rghash);
    bam_mplp_destroy(iter);
    bam_hdr_destroy(h);
    for (i = 0; i < n; ++i) {
        sam_close(data[i]->fp);
        if (data[i]->iter) hts_itr_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data); free(plp); free(ref); free(n_plp);
    return ret;
}
Exemple #24
0
void abcWriteBcf::print(funkyPars *pars){
  if(doBcf==0)
    return;
  kstring_t buf;
  if(fp==NULL){
    buf.s=NULL;buf.l=buf.m=0;
    fp=aio::openFileHts(outfiles,".bcf");
    hdr = bcf_hdr_init("w");
    rec    = bcf_init1();
    print_bcf_header(fp,hdr,args,buf,header);
  }
  lh3struct *lh3 = (lh3struct*) pars->extras[5];
  freqStruct *freq = (freqStruct *) pars->extras[6];
  genoCalls *geno = (genoCalls *) pars->extras[10];
  
  for(int s=0;s<pars->numSites;s++){
    if(pars->keepSites[s]==0)
      continue;

    rec->rid = bcf_hdr_name2id(hdr,header->target_name[pars->refId]);
    rec->pos = pars->posi[s];//<- maybe one index?
    //    bcf_update_id(hdr, rec, "rs6054257");
    char majmin[4]={intToRef[pars->major[s]],',',intToRef[pars->minor[s]],'\0'};
    bcf_update_alleles_str(hdr, rec, majmin);
    rec->qual = 29;
    // .. FILTER
    int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
    bcf_update_filter(hdr, rec, &tmpi, 1);
    // .. INFO
    
    tmpi = pars->keepSites[s];
    bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);

    if(pars->counts){
      int depth = 0;
      for(int i=0; i<4*pars->nInd; i++)
	depth += pars->counts[s][i];
      tmpi = depth;
      bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);

    }
    if(freq){
      float tmpf = freq->freq_EM[s];
      bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
    }
    
    // .. FORMAT
    assert(geno);
    if(geno){
      int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int32_t));
      for(int i=0; i<pars->nInd;i++){
	if(geno->dat[s][i]==0){
	  tmpia[2*i+0] = bcf_gt_unphased(0);
	  tmpia[2*i+1] = bcf_gt_unphased(0);
	}else if(geno->dat[s][i]==1){
	  tmpia[2*i+0] = bcf_gt_unphased(0);
	  tmpia[2*i+1] = bcf_gt_unphased(1);
	}  else{
	  tmpia[2*i+0] = bcf_gt_unphased(1);
	  tmpia[2*i+1] = bcf_gt_unphased(1);
	}
      }
      bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); 
      free(tmpia);
    }
    if(pars->counts){
      int32_t *tmpfa = (int32_t*)malloc(sizeof(int32_t)*bcf_hdr_nsamples(hdr));
      suint *ary=pars->counts[s];
      for(int i=0;i<bcf_hdr_nsamples(hdr);i++)
	tmpfa[i] = ary[0]+ary[1]+ary[2]+ary[3];
      bcf_update_format_int32(hdr, rec, "DP", tmpfa,bcf_hdr_nsamples(hdr) );
      free(tmpfa);
    }
    assert(lh3);
    if(lh3){
      float *tmpfa  =   (float*)malloc(3*bcf_hdr_nsamples(hdr)*sizeof(float  ));
      int32_t *tmpi = (int32_t*)malloc(3*bcf_hdr_nsamples(hdr)*sizeof(int32_t));
      double *ary = lh3->lh3[s];
      for(int i=0;i<bcf_hdr_nsamples(hdr);i++)
	for(int j=0;j<3;j++){
	  tmpfa[i*3+j] = ary[i*3+j]/M_LN10;
	  tmpi[i*3+j] =(int) -log10(exp(ary[i*3+j]))*10.0;
	  //	  fprintf(stderr,"pl:%d raw:%f\n",tmpi[i*3+j],ary[i*3+j]);
	}
      bcf_update_format_float(hdr, rec, "GL", tmpfa,3*bcf_hdr_nsamples(hdr) );
      bcf_update_format_int32(hdr, rec, "PL", tmpi,3*bcf_hdr_nsamples(hdr) );
      free(tmpfa);
      free(tmpi);
    }

    if ( bcf_write1(fp, hdr, rec)!=0 ){
      fprintf(stderr,"Failed to write to \n");
      exit(0);
    }
    //    fprintf(stderr,"------\n");
    bcf_clear1(rec);
  }
}
static void init_data(args_t *args)
{
    args->aux.srs = bcf_sr_init();

    // Open files for input and output, initialize structures
    if ( args->targets )
    {
        if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
            error("Failed to read the targets: %s\n", args->targets);

        if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
        {
            args->aux.srs->targets->missed_reg_handler = print_missed_line;
            args->aux.srs->targets->missed_reg_data = args;
        }
    }
    if ( args->regions )
    {
        if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
            error("Failed to read the targets: %s\n", args->regions);
    }

    int i;
    if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open: %s\n", args->bcf_fname);

    if ( args->nsamples && args->nsamples != bcf_hdr_nsamples(args->aux.srs->readers[0].header) )
    {
        args->samples_map = (int *) malloc(sizeof(int)*args->nsamples);
        args->aux.hdr = bcf_hdr_subset(args->aux.srs->readers[0].header, args->nsamples, args->samples, args->samples_map);
        for (i=0; i<args->nsamples; i++)
            if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
        if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
    }
    else
    {
        args->aux.hdr = bcf_hdr_dup(args->aux.srs->readers[0].header);
        for (i=0; i<args->nsamples; i++)
            if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
                error("No such sample: %s\n", args->samples[i]);
    }

    // Reorder ploidy and family indexes to match mpileup's output and exclude samples which are not available
    if ( args->aux.ploidy )
    {
        for (i=0; i<args->aux.nfams; i++)
        {
            int j;
            for (j=0; j<3; j++)
            {
                int k = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[ args->aux.fams[i].sample[j] ]);
                if ( k<0 ) error("No such sample: %s\n", args->samples[ args->aux.fams[i].sample[j] ]);
                args->aux.fams[i].sample[j] = k;
            }
        }
        uint8_t *ploidy = (uint8_t*) calloc(bcf_hdr_nsamples(args->aux.hdr), 1);
        for (i=0; i<args->nsamples; i++)    // i index in -s sample list
        {
            int j = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[i]);     // j index in the output VCF / subset VCF
            if ( j<0 )
            {
                fprintf(stderr,"Warning: no such sample: \"%s\"\n", args->samples[i]);
                continue;
            }
            ploidy[j] = args->aux.ploidy[i];
        }
        args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
        for (i=0; i<args->nsamples; i++)
            assert( ploidy[i]==0 || ploidy[i]==1 || ploidy[i]==2 );
        free(args->aux.ploidy);
        args->aux.ploidy = ploidy;
    }

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));

    if ( args->flag & CF_QCALL )
        return;

    if ( args->flag & CF_MCALL )
        mcall_init(&args->aux);

    if ( args->flag & CF_CCALL )
        ccall_init(&args->aux);

    if ( args->flag&CF_GVCF )
    {
        bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        args->gvcf.rid  = -1;
        args->gvcf.line = bcf_init1();
        args->gvcf.gt   = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr));
        for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
        {
            args->gvcf.gt[2*i+0] = bcf_gt_unphased(0);
            args->gvcf.gt[2*i+1] = bcf_gt_unphased(0);
        }
    }

    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");

    bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
    bcf_hdr_write(args->out_fh, args->aux.hdr);

    if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}
Exemple #26
0
static void init_data(args_t *args)
{
    args->aux.srs = bcf_sr_init();

    // Open files for input and output, initialize structures
    if ( args->targets )
    {
        if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
            error("Failed to read the targets: %s\n", args->targets);

        if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
        {
            args->aux.srs->targets->missed_reg_handler = print_missed_line;
            args->aux.srs->targets->missed_reg_data = args;
        }
    }
    if ( args->regions )
    {
        if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
            error("Failed to read the targets: %s\n", args->regions);
    }

    if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
    args->aux.hdr = bcf_sr_get_header(args->aux.srs,0);

    int i;
    if ( args->samples_fname )
    {
        set_samples(args, args->samples_fname, args->samples_is_file);
        if ( args->aux.flag&CALL_CONSTR_TRIO )
        {
            if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
            fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
        }
        args->nsex = ploidy_nsex(args->ploidy);
        args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
        args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
        args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
        for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
        for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
    }

    if ( args->samples_map )
    {
        args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map);
        if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n");
        for (i=0; i<args->nsamples; i++)
            if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
        if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
    }
    else
    {
        args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
        for (i=0; i<args->nsamples; i++)
            if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
                error("No such sample: %s\n", args->samples[i]);
    }

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));

    if ( args->flag & CF_QCALL )
        return;

    if ( args->flag & CF_MCALL )
        mcall_init(&args->aux);

    if ( args->flag & CF_CCALL )
        ccall_init(&args->aux);

    if ( args->flag&CF_GVCF )
    {
        bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        args->gvcf.rid  = -1;
        args->gvcf.line = bcf_init1();
        args->gvcf.gt   = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr));
        for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
        {
            args->gvcf.gt[2*i+0] = bcf_gt_unphased(0);
            args->gvcf.gt[2*i+1] = bcf_gt_unphased(0);
        }
    }

    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");

    bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
    bcf_hdr_write(args->out_fh, args->aux.hdr);

    if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}