Example #1
0
/* 
    Called once at startup, allows to initialize local variables.
    Return 1 to suppress VCF/BCF header from printing, 0 for standard
    VCF/BCF output and -1 on critical errors.
*/
int init(const char *opts, bcf_hdr_t *in, bcf_hdr_t *out)
{
    int i, id;

    in_hdr = in;
    tags = config_get_list(opts ? opts : "tags=PL,GL,GT","tags", &ntags);
    for (i=0; i<ntags; i++)
    {
        if ( !strcmp("PL",tags[i]) )
        {
            id = bcf_hdr_id2int(in_hdr,BCF_DT_ID,"PL");
            if ( bcf_hdr_idinfo_exists(in_hdr,BCF_HL_FMT,id) ) 
            {
                pl_type = bcf_hdr_id2type(in_hdr,BCF_HL_FMT,id);
                if ( pl_type!=BCF_HT_INT && pl_type!=BCF_HT_REAL ) 
                {
                    fprintf(stderr,"Expected numeric type of FORMAT/PL\n");
                    return -1;
                }
                handlers = (dosage_f*) realloc(handlers,(nhandlers+1)*sizeof(*handlers));
                handlers[nhandlers++] = calc_dosage_PL;
            }
        }
        else if ( !strcmp("GL",tags[i]) )
        {
            id = bcf_hdr_id2int(in_hdr,BCF_DT_ID,"GL");
            if ( bcf_hdr_idinfo_exists(in_hdr,BCF_HL_FMT,id) )
            {
                gl_type = bcf_hdr_id2type(in_hdr,BCF_HL_FMT,id);
                if ( gl_type!=BCF_HT_INT && gl_type!=BCF_HT_REAL ) 
                {
                    fprintf(stderr,"Expected numeric type of FORMAT/GL\n");
                    return -1;
                }
                handlers = (dosage_f*) realloc(handlers,(nhandlers+1)*sizeof(*handlers));
                handlers[nhandlers++] = calc_dosage_GL;
            }
        }
        else if ( !strcmp("GT",tags[i]) )
        {
            handlers = (dosage_f*) realloc(handlers,(nhandlers+1)*sizeof(*handlers));
            handlers[nhandlers++] = calc_dosage_GT;
        }
        else 
        {
            fprintf(stderr,"No handler for tag \"%s\"\n", tags[i]);
            return -1;
        }
    }
    free(tags[0]);
    free(tags);

    printf("#[1]CHROM\t[2]POS\t[3]REF\t[4]ALT");
    for (i=0; i<bcf_hdr_nsamples(in_hdr); i++) printf("\t[%d]%s", i+5,in_hdr->samples[i]);
    printf("\n");

    return 1;
}
Example #2
0
static int filters_init_func(filter_t *filter, int func_type, char **str, token_t *tok)
{
    char *e = *str;
    while ( *e && *e!=')' ) e++;
    if ( !*e ) error("Could not parse the expression, right bracket not found [...%s]\n", str);

    kstring_t tmp = {0,0,0};
    kputsn(*str, e-(*str), &tmp);
    (*str) += e-(*str)+1;

    tok->hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, tmp.s);
    if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
        error("[%s:%d %s] Error: the tag \"FORMAT/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);

    int fmt_type = bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id);
    if ( fmt_type!=BCF_HT_INT && fmt_type!=BCF_HT_REAL ) error("[%s:%d %s] Error: expected numeric tag with %s\n", tmp.s);

    switch (func_type)
    {
        case TOK_MAX: tok->setter = filters_set_format_max; break;
        case TOK_MIN: tok->setter = filters_set_format_min; break;
        case TOK_AVG: tok->setter = filters_set_format_avg; break;
        default: error("[%s:%d %s] Error: unknown func_type: %d\n", func_type);
    }
    tok->tok_type = TOK_VAL;
    tok->tag = tmp.s;

    return 0;
}
 void 
 VariantHeaderMerger<fields_forward_LUT_ordering, fields_reverse_LUT_ordering, samples_forward_LUT_ordering, samples_reverse_LUT_ordering>::
 add_header_fields_mapping(bcf_hdr_t* curr_header, unsigned input_vcf_idx)
 {
   assert(m_merged_vcf_header_ptr);
   for(auto j=0;j<curr_header->n[BCF_DT_ID];++j)
   {
     auto curr_id = &(curr_header->id[BCF_DT_ID][j]);
     for(auto bcf_hl_type : { BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT })
     {
       //id has been deleted - ignore
       if(!bcf_hdr_idinfo_exists(curr_header, bcf_hl_type, j))
         continue;
       bcf_hrec_t* hrec = bcf_hdr_id2hrec(curr_header, BCF_DT_ID, bcf_hl_type, j);
       if(hrec) //not deleted
       {
         const char* key = curr_id->key;
         auto merged_idx = bcf_hdr_id2int(m_merged_vcf_header_ptr.get(), BCF_DT_ID, key);
         assert(merged_idx >= 0 && merged_idx < m_merged_vcf_header_ptr->n[BCF_DT_ID]);
         assert(bcf_hdr_idinfo_exists(m_merged_vcf_header_ptr, bcf_hl_type, merged_idx));
         m_header_fields_LUT.add_input_merged_idx_pair(input_vcf_idx, j, merged_idx);
       }
     }
   }
 }
static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
{
    kstring_t str = {0,0,0};
    const char *tmp = filters, *prev = filters;
    int nout = 0, *out = NULL;
    while ( 1 )
    {
        if ( *tmp==',' || !*tmp )
        {
            out = (int*) realloc(out, (nout+1)*sizeof(int));
            if ( tmp-prev==1 && *prev=='.' )
                out[nout] = -1;
            else
            {
                str.l = 0;
                kputsn(prev, tmp-prev, &str);
                out[nout] = bcf_hdr_id2int(hdr, BCF_DT_ID, str.s);
            }
            nout++;
            if ( !*tmp ) break;
            prev = tmp+1;
        }
        tmp++;
    }
    if ( str.m ) free(str.s);
    *nfilters = nout;
    return out;
}
 void 
 VariantHeaderMerger<fields_forward_LUT_ordering, fields_reverse_LUT_ordering, samples_forward_LUT_ordering, samples_reverse_LUT_ordering>::
 store_merged_field_idx_for_enum(const string& field_name, unsigned field_enum)
 {
   if(field_enum >= m_num_enums_allocated)
   {
     m_num_enums_allocated = 2u*field_enum + 1u;
     m_merged_field_idx_enum_lut.resize_luts_if_needed(1u, m_num_enums_allocated);
   }
   auto val =  bcf_hdr_id2int(m_merged_vcf_header_ptr.get(), BCF_DT_ID, field_name.c_str());
   if(val == -1)
     m_merged_field_idx_enum_lut.reset_merged_idx_for_input(0u, field_enum);
   else
     m_merged_field_idx_enum_lut.add_input_merged_idx_pair(0u, field_enum, val);
 }
Example #6
0
static void init_data(args_t *args)
{
    args->header = args->files->readers[0].header;

    int i, nsamples = 0, *samples = NULL;
    if ( args->sample_list && strcmp("-",args->sample_list) )
    {
        for (i=0; i<args->files->nreaders; i++)
        {
            int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
            if ( ret<0 ) error("Error parsing the sample list\n");
            else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
        }

        if ( args->sample_list[0]!='^' )
        {
            // the sample ordering may be different if not negated
            int n;
            char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
            if ( !smpls ) error("Could not parse %s\n", args->sample_list);
            if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
                error("The number of samples does not match, perhaps some are present multiple times?\n");
            nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
            samples = (int*) malloc(sizeof(int)*nsamples);
            for (i=0; i<n; i++)
            {
                samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
                free(smpls[i]);
            }
            free(smpls);
        }
    }
    args->convert = convert_init(args->header, samples, nsamples, args->format_str);
    if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
    free(samples);

    int max_unpack = convert_max_unpack(args->convert);
    if ( args->filter_str )
    {
        args->filter = filter_init(args->header, args->filter_str);
        max_unpack |= filter_max_unpack(args->filter);
    }
    args->files->max_unpack = max_unpack;
}
Example #7
0
void bcf_hdr_merge(bcf_hdr_t *hw, const bcf_hdr_t *_hr, const char *clash_prefix)
{
    bcf_hdr_t *hr = (bcf_hdr_t*)_hr;

    // header lines
    int i, nw_ori = hw->nhrec;
    for (i=0; i<hr->nhrec; i++)
    {
        if ( hr->hrec[i]->type==BCF_HL_GEN && hr->hrec[i]->value )
        {
            int j;
            for (j=0; j<nw_ori; j++)
            {
                if ( hw->hrec[j]->type!=BCF_HL_GEN ) continue;
                if ( !strcmp(hr->hrec[i]->key,hw->hrec[j]->key) && !strcmp(hr->hrec[i]->value,hw->hrec[j]->value) ) break;
            }
            if ( j>=nw_ori )
                bcf_hdr_add_hrec(hw, bcf_hrec_dup(hr->hrec[i]));
        }
        else
        {
            bcf_hrec_t *rec = bcf_hdr_get_hrec(hw, hr->hrec[i]->type, hr->hrec[i]->vals[0]);
            if ( !rec )
                bcf_hdr_add_hrec(hw, bcf_hrec_dup(hr->hrec[i]));
        }
    }

    // samples
    for (i=0; i<bcf_hdr_nsamples(hr); i++)
    {
        char *name = hr->samples[i];
        if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 )
        {
            // there is a sample with the same name
            int len = strlen(hr->samples[i]) + strlen(clash_prefix) + 1;
            name = (char*) malloc(sizeof(char)*(len+1));
            sprintf(name,"%s:%s",clash_prefix,hr->samples[i]);
            bcf_hdr_add_sample(hw,name);
            free(name);
        }
        else
            bcf_hdr_add_sample(hw,name);
    }
}
 void 
 VariantHeaderMerger<fields_forward_LUT_ordering, fields_reverse_LUT_ordering, samples_forward_LUT_ordering, samples_reverse_LUT_ordering>::
 add_samples_mapping(bcf_hdr_t* curr_header, unsigned input_vcf_idx)
 {
   for(auto j=0;j<bcf_hdr_nsamples(curr_header);++j)
   {
     if(curr_header->samples[j] && bcf_hdr_id2int(curr_header, BCF_DT_SAMPLE, curr_header->samples[j]) >= 0)
     {
       if(m_sample2idx_merged.find(curr_header->samples[j]) == m_sample2idx_merged.end())
       {
         auto curr_size = m_sample2idx_merged.size();
         m_sample2idx_merged[curr_header->samples[j]] = curr_size;
       }
       m_samples_LUT.add_input_merged_idx_pair(input_vcf_idx, j, m_sample2idx_merged[curr_header->samples[j]]);
     }
     else
       m_samples_LUT.reset_merged_idx_for_input(input_vcf_idx, j);
   }
 }
Example #9
0
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    memset(&args,0,sizeof(args_t));
    args.in_hdr  = in;
    args.out_hdr = out;

    static struct option loptions[] =
    {
        {"tags",1,0,'t'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:T:l:cd",loptions,NULL)) >= 0)
    {
        switch (c) 
        {
            case 't': args.tags |= parse_tags(&args,optarg); break;
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    if ( optind != argc ) error(usage());
    if ( !args.tags ) args.tags |= SET_AN|SET_AC|SET_NS|SET_AC_Hom|SET_AC_Het|SET_AC_Hemi|SET_AF;
    
    args.gt_id = bcf_hdr_id2int(args.in_hdr,BCF_DT_ID,"GT");
    if ( args.gt_id<0 ) error("Error: GT field is not present\n");

    if ( args.tags&SET_AN ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
    if ( args.tags&SET_AC ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
    if ( args.tags&SET_NS ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">");
    if ( args.tags&SET_AC_Hom ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hom,Number=A,Type=Integer,Description=\"Allele counts in homozygous genotypes\">");
    if ( args.tags&SET_AC_Het ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Het,Number=A,Type=Integer,Description=\"Allele counts in heterozygous genotypes\">");
    if ( args.tags&SET_AC_Hemi ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hemi,Number=A,Type=Integer,Description=\"Allele counts in hemizygous genotypes\">");
    if ( args.tags&SET_AF ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele frequency\">");

    return 0;
}
Example #10
0
void merge_filter(args_t *args, bcf1_t *out)
{
    bcf_srs_t *files = args->files;
    bcf_hdr_t *out_hdr = args->out_hdr;

    int i, ret;
    khiter_t kitr;
    strdict_t *tmph = args->tmph;
    kh_clear(strdict, tmph);

    maux_t *ma = args->maux;
    out->d.n_flt = 0;
    for (i=0; i<files->nreaders; i++)
    {
        if ( !ma->has_line[i]) continue;

        bcf_sr_t *reader = &files->readers[i];
        bcf1_t *line = reader->buffer[0];
        bcf_hdr_t *hdr = reader->header;
        bcf_unpack(line, BCF_UN_ALL);

        int k;
        for (k=0; k<line->d.n_flt; k++)
        {
            const char *flt = hdr->id[BCF_DT_ID][line->d.flt[k]].key;
            kitr = kh_get(strdict, tmph, flt);
            if ( kitr == kh_end(tmph) )
            {
                int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt);
                if ( id==-1 ) error("The filter not defined: %s\n", flt);
                hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt);
                ma->flt[out->d.n_flt] = id;
                out->d.n_flt++;
                kh_put(strdict, tmph, flt, &ret);
            }
        }
    }
Example #11
0
static void init_data(args_t *args)
{
    args->files = bcf_sr_init();
    args->files->require_index = 1;
    if ( !bcf_sr_add_reader(args->files,args->fname) ) error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum));
    args->hdr = args->files->readers[0].header;
    args->isample = -1;
    if ( args->sample )
    {
        args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
        if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
    }
    if ( args->haplotype && args->isample<0 )
    {
        if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
        args->isample = 0;
    }
    if ( args->mask_fname )
    {
        args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
        if ( !args->mask ) error("Failed to initialize mask regions\n");
    }
    // In case we want to store the chains
    if ( args->chain_fname )
    {
        args->fp_chain = fopen(args->chain_fname,"w");
        if ( ! args->fp_chain ) error("Failed to create %s: %s\n", args->chain_fname, strerror(errno));
        args->chain_id = 0;
    }
    rbuf_init(&args->vcf_rbuf, 100);
    args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));
    if ( args->output_fname ) {
        args->fp_out = fopen(args->output_fname,"w");
        if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
    }
    else args->fp_out = stdout;
}
Example #12
0
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) 
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample ) 
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 ) 
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n", args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            m_gt2ipl = n_gt2ipl;
            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 ) 
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0;  // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++) 
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( a<0 || b<0 ) continue; // missing genotypes
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = igt_tgt = bcf_alleles2gt(a,b); // genotype index in the target file
            int igt_qry = gt2ipl[igt_tgt];  // corresponding genotype in query file
            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
            args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl); 
            args->sites[i]++; 
        }
        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");

            int igt, *pl_ptr = args->pl_arr + query_isample*npl; // PLs of the query sample
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]); 
            for (igt=0; igt<npl; igt++)   
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
                else fprintf(fp, "\t%d", pl_ptr[igt]); 
            fprintf(fp, "\n"); 
        }
    }
    free(gt2ipl);
    free(gt_arr);
    free(args->pl_arr);
    free(args->tmp_arr);

    // Scale LKs and certainties
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double min = args->lks[0];
    for (i=1; i<nsamples; i++) if ( min>args->lks[i] ) min = args->lks[i];
    for (i=0; i<nsamples; i++) args->lks[i] = min ? args->lks[i] / min : 0;
    double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0;
    for (i=1; i<nsamples; i++) 
    {
        double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0;
        if ( max_avg<val ) max_avg = val;
    }

    // Sorted output
    double **p = (double**) malloc(sizeof(double*)*nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(int*), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;
        double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0;
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i);
    }

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
Example #13
0
static void cross_check_gts(args_t *args)
{
    int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
    unsigned int *dp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)), *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int)); // this will overflow one day...
    int fake_pls = args->no_PLs, ignore_dp = 0;

    int i,j,k,idx, pl_warned = 0, dp_warned = 0;
    int32_t *dp_arr = NULL;
    int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 ) 
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);
    if ( args->all_sites ) fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");

    while ( bcf_sr_next_line(args->files) )
    {
        bcf1_t *line = args->files->readers[0].buffer[0];
        bcf_unpack(line, BCF_UN_FMT);

        int npl;
        if ( !fake_pls )
        {
            npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
            if ( npl<=0 ) { pl_warned++; continue; }
            npl /= nsamples;
        }
        else
            npl = fake_PLs(args, args->sm_hdr, line);
        if ( !ignore_dp && bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr) <= 0 ) { dp_warned++; continue; }

        if ( args->hom_only )
        {
            for (i=0; i<nsamples; i++)
                is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
        }

        double sum = 0; int nsum = 0;
        idx = 0;
        for (i=0; i<nsamples; i++)
        {
            int *ipl = &args->pl_arr[i*npl];
            if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
            if ( !ignore_dp && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
            if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }

            for (j=0; j<i; j++)
            {
                int *jpl = &args->pl_arr[j*npl];
                if ( *jpl==-1 ) { idx++; continue; } // missing genotype
                if ( !ignore_dp && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
                if ( args->hom_only && !is_hom[j] ) { idx++; continue; }

                int min_pl = INT_MAX;
                for (k=0; k<npl; k++)
                {
                    if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
                    if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
                    if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
                }
                if ( k!=npl ) { idx++; continue; }

                if ( args->all_sites ) { sum += min_pl; nsum++; }
                args->lks[idx] += min_pl;
                args->cnts[idx]++;

                if ( !ignore_dp )
                {
                    args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
                    dp[i] += dp_arr[i]; ndp[i]++;
                    dp[j] += dp_arr[j]; ndp[j]++;
                }
                else
                {
                    args->dps[idx]++;
                    dp[i]++; ndp[i]++;
                    dp[j]++; ndp[j]++;
                }
                idx++;
            }
        }
        if ( args->all_sites ) 
            fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
    }
    if ( dp_arr ) free(dp_arr);
    if ( args->pl_arr ) free(args->pl_arr);
    if ( args->tmp_arr ) free(args->tmp_arr);
    if ( is_hom ) free(is_hom);

    if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
    if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);

    // Output samples sorted by average discordance
    double *score  = (double*) calloc(nsamples,sizeof(double));
    args->sites = (double*) calloc(nsamples,sizeof(double));
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            score[i] += args->lks[idx];
            score[j] += args->lks[idx];
            args->sites[i] += args->cnts[idx];
            args->sites[j] += args->cnts[idx];
            idx++;
        }
    }
    for (i=0; i<nsamples; i++) 
        if ( args->sites[i] ) score[i] /= args->sites[i];
    double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
    for (i=0; i<nsamples; i++) p[i] = &score[i];
    qsort(p, nsamples, sizeof(int*), cmp_doubleptr);
    // The average discordance gives the number of differing sites in % with -G1
    fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
    for (i=0; i<nsamples; i++)
    {
        idx = p[i] - score;
        double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
        double nsites = args->sites[idx]/(nsamples-1);
        avg_score += score[idx];
        fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
    }

    // Overall score: maximum absolute deviation from the average score
    fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
    fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]);    // idx still set
    free(p);
    free(score);
    free(dp);
    free(ndp);

    // Pairwise discordances
    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0, 
                    args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
            idx++;
        }
    }
    fclose(fp);
    if ( args->plot )
        plot_cross_check(args);
}
Example #14
0
int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
{
	int i;
	for (i=0; i<line->n_allele; i++) ac[i]=0;

	// Use INFO/AC,AN field only when asked
	if ( which&BCF_UN_INFO )
	{
		bcf_unpack(line, BCF_UN_INFO);
		int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
		int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
        int i, an=-1, ac_len=0, ac_type=0;
        uint8_t *ac_ptr=NULL;
		if ( an_id>=0 && ac_id>=0 )
		{
			for (i=0; i<line->n_info; i++)
			{
				bcf_info_t *z = &line->d.info[i];
				if ( z->key == an_id ) an = z->v1.i;
				else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
			}
        }
        if ( an>=0 && ac_ptr )
        {
			int nac = 0;
            #define BRANCH_INT(type_t) {        \
                type_t *p = (type_t *) ac_ptr;  \
                for (i=0; i<ac_len; i++)        \
                {                               \
                    ac[i+1] = p[i];             \
                    nac += p[i];                \
                }                               \
            }
            switch (ac_type) {
                case BCF_BT_INT8:  BRANCH_INT(int8_t); break;
                case BCF_BT_INT16: BRANCH_INT(int16_t); break;
                case BCF_BT_INT32: BRANCH_INT(int32_t); break;
                default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
            }
            #undef BRANCH_INT
            assert( an>=nac );  // sanity check for missing values
			ac[0] = an - nac;
			return 1;
        }
	}

	// Split genotype fields only when asked
	if ( which&BCF_UN_FMT )
	{
		int i, gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
		if ( gt_id<0 ) return 0;
		bcf_unpack(line, BCF_UN_FMT);
		bcf_fmt_t *fmt_gt = NULL;
		for (i=0; i<(int)line->n_fmt; i++) 
			if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
		if ( !fmt_gt ) return 0;
        #define BRANCH_INT(type_t,missing,vector_end) { \
		    for (i=0; i<line->n_sample; i++) \
		    { \
                type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
		    	int ial; \
		    	for (ial=0; ial<fmt_gt->n; ial++) \
		    	{ \
                    if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
                    if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \
		    		ac[(p[ial]>>1)-1]++; \
		    	} \
		    } \
        }
        switch (fmt_gt->type) {
            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
            default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
        }
        #undef BRANCH_INT
		return 1;
	}
	return 0;
}
Example #15
0
int main(int argc, char **argv)
{
    char *fname = argc>1 ? argv[1] : "/dev/null";
    htsFile *fp = hts_open(fname, "w");
    bcf_hdr_t *hdr1, *hdr2;

    hdr1 = bcf_hdr_init("w");
    hdr2 = bcf_hdr_init("w");

    // Add two shared and two private annotations
    bcf_hdr_append(hdr1, "##contig=<ID=1>");
    bcf_hdr_append(hdr1, "##contig=<ID=2>");
    bcf_hdr_append(hdr2, "##contig=<ID=2>");
    bcf_hdr_append(hdr2, "##contig=<ID=1>");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT1,Description=\"Filter 1\">");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT2,Description=\"Filter 2\">");
    bcf_hdr_append(hdr1, "##FILTER=<ID=FLT3,Description=\"Filter 3\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT4,Description=\"Filter 4\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT3,Description=\"Filter 3\">");
    bcf_hdr_append(hdr2, "##FILTER=<ID=FLT2,Description=\"Filter 2\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF1,Number=.,Type=Integer,Description=\"Info 1\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">");
    bcf_hdr_append(hdr1, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF4,Number=.,Type=Integer,Description=\"Info 4\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">");
    bcf_hdr_append(hdr2, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT1,Number=.,Type=Integer,Description=\"FMT 1\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">");
    bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT4,Number=.,Type=Integer,Description=\"FMT 4\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">");
    bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">");
    bcf_hdr_add_sample(hdr1,"SMPL1");
    bcf_hdr_add_sample(hdr1,"SMPL2");
    bcf_hdr_add_sample(hdr2,"SMPL1");
    bcf_hdr_add_sample(hdr2,"SMPL2");
    bcf_hdr_sync(hdr1);
    bcf_hdr_sync(hdr2);

    hdr2 = bcf_hdr_merge(hdr2,hdr1);
    bcf_hdr_sync(hdr2);
    if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname);

    bcf1_t *rec = bcf_init1();
    rec->rid = bcf_hdr_name2id(hdr1, "1");
    rec->pos = 0;
    bcf_update_alleles_str(hdr1, rec, "G,A");
    int32_t tmpi[3];
    tmpi[0] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT1");
    tmpi[1] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2");
    tmpi[2] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT3");
    bcf_update_filter(hdr1, rec, tmpi, 3);
    tmpi[0] = 1; bcf_update_info_int32(hdr1, rec, "INF1", tmpi, 1);
    tmpi[0] = 2; bcf_update_info_int32(hdr1, rec, "INF2", tmpi, 1);
    tmpi[0] = 3; bcf_update_info_int32(hdr1, rec, "INF3", tmpi, 1);
    tmpi[0] = tmpi[1] = 1; bcf_update_format_int32(hdr1, rec, "FMT1", tmpi, 2);
    tmpi[0] = tmpi[1] = 2; bcf_update_format_int32(hdr1, rec, "FMT2", tmpi, 2);
    tmpi[0] = tmpi[1] = 3; bcf_update_format_int32(hdr1, rec, "FMT3", tmpi, 2);

    bcf_remove_filter(hdr1, rec, bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"), 0);
    bcf_update_info_int32(hdr1, rec, "INF2", NULL, 0);
    bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0);

    bcf_translate(hdr2, hdr1, rec);
    if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname);

    // Clean
    bcf_destroy1(rec);
    bcf_hdr_destroy(hdr1);
    bcf_hdr_destroy(hdr2);
    int ret;
    if ( (ret=hts_close(fp)) )
    {
        fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret);
        exit(ret);
    }
    return 0;
}
Example #16
0
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) ) error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, nprocessed = 0, ntotal = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        ntotal++;

        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        nprocessed++;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
//-------- matplotlib script --------------
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        "   csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "   with open(outdir+'/dist.dat', 'rb') as f:\n"
        "      reader = csv.reader(f, 'tab')\n"
        "      for row in reader:\n"
        "          if row[0][0]=='#': continue\n"
        "          type = row[0]\n"
        "          chr  = row[1]\n"
        "          if type=='DIST':\n"
        "              if chr not in dat: dat[chr] = []\n"
        "              dat[chr].append(row)\n"
        "          elif type=='FIT':\n"
        "              if chr not in fit: fit[chr] = []\n"
        "              fit[chr].append(row)\n"
        "          elif type=='CN':\n"
        "              cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "   if chr in fit:\n"
        "       for i in range(len(fit[chr])):\n"
        "           pfit = fit[chr][i]\n"
        "           exec('def xfit(x): return '+pfit[5])\n"
        "           istart = int(pfit[3])\n"
        "           iend   = int(pfit[4])+1\n"
        "           vals   = dat[chr][istart:iend]\n"
        "           args   = {}\n"
        "           if i==0: args = {'label':'Target to Fit'}\n"
        "           ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "           if i==0: args = {'label':'Best Fit'}\n"
        "           ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "   ax.set_title('BAF distribution, chr'+chr)\n"
        "   ax.set_xlabel('BAF')\n"
        "   ax.set_ylabel('Frequency')\n"
        "   ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "   plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "   plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   xlabels = sorted(cn.keys())\n"
        "   xvals = range(len(xlabels))\n"
        "   yvals = [float(cn[x]) for x in xlabels]\n"
        "   ax.plot(xvals,yvals,'o',color='red')\n"
        "   for i in range(len(xvals)):\n"
        "       if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "   ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "   ax.set_xticks(xvals)\n"
        "   ax.set_xticklabels(xlabels,rotation=45)\n"
        "   ax.set_xlim(-1,len(xlabels))\n"
        "   ax.set_ylim(0,5.0)\n"
        "   ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "   ax.set_xlabel('Chromosome')\n"
        "   ax.set_ylabel('Copy Number')\n"
        "   plt.savefig(outdir+'/copy-number.png')\n"
        "   plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        "   def error(self, message):\n"
        "       self.print_help()\n"
        "       sys.stderr.write('error: %%s\\n' %% message)\n"
        "       sys.exit(2)\n"
        "\n"
        "def main():\n"
        "   parser = myParser()\n"
        "   parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "   parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "   parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "   args = parser.parse_args()\n"
        "   dat = {}; fit = {}; cn = {}\n"
        "   read_dat(dat,fit,cn)\n"
        "   if args.distrib!=None:\n"
        "       plot_dist(dat,fit,args.distrib)\n"
        "   if args.all:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "       plot_copy_number(cn)\n"
        "   elif args.copy_number:\n"
        "       plot_copy_number(cn)\n"
        "   else:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        "   main()\n",
        args->output_dir);
//---------------------------------------
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
Example #17
0
    void BlockQuantify::count()
    {
        _impl->fasta_to_use.reset(new FastaFile(_impl->ref_fasta));
#ifdef DEBUG_BLOCKQUANTIFY
        int lastpos = 0;
        std::cerr << "starting block." << "\n";
#endif
        auto current_bs_start = _impl->variants.begin();
        std::string current_chr;
        int current_bs = -1;
        bool current_bs_valid = false;

        // function to compute the QQ values for truth variants in the current
        // benchmarking superlocus
        const auto update_bs_filters = [this, &current_bs_start](BlockQuantifyImpl::variantlist_t::iterator to)
        {
            std::set<int> bs_filters;
            for(auto cur = current_bs_start; cur != to; ++cur)
            {
                for(int nf = 0; nf < (*cur)->d.n_flt; ++nf)
                {
                    const int f = (*cur)->d.flt[nf];
                    if(f != bcf_hdr_id2int(_impl->hdr, BCF_DT_ID, "PASS"))
                    {
                        bs_filters.insert(f);
                    }
                }
            }

            if(bs_filters.empty())
            {
                return;
            }

            for(auto cur = current_bs_start; cur != to; ++cur)
            {
                const std::string bdt = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 0);
                const std::string bvq = bcfhelpers::getFormatString(_impl->hdr, *cur, "BVT", 1);
                // filter TPs where the query call in NOCALL
                if(bdt == "TP" && bvq == "NOCALL")
                {
                    for(auto f : bs_filters)
                    {
                        bcf_add_filter(_impl->hdr, *cur, f);
                    }
                }
            }
        };

        // function to compute the QQ values for truth variants in the current
        // benchmarking superlocus
        const auto update_bs_qq = [this, &current_bs_start](BlockQuantifyImpl::variantlist_t::iterator to)
        {
            std::vector<float> tp_qqs;
            for(auto cur = current_bs_start; cur != to; ++cur)
            {
                const float qqq = bcfhelpers::getFormatFloat(_impl->hdr, *cur, "QQ", 1);
                if(std::isnan(qqq))
                {
                    continue;
                }
                const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 1);
                // we want the scores of all TPs in this BS
                if(bd == "TP")
                {
                    tp_qqs.push_back(qqq);
                }
            }

            float t_qq = bcfhelpers::missing_float();
            if(!tp_qqs.empty())
            {
                t_qq = *(std::min_element(tp_qqs.begin(), tp_qqs.end()));
            }

            /** compute the median over all variants */
            int fsize = bcf_hdr_nsamples(_impl->hdr);
            float * fmt = (float*)calloc((size_t) fsize, sizeof(float));
            for(auto cur = current_bs_start; cur != to; ++cur)
            {
                const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 0);
                bcf_get_format_float(_impl->hdr, *cur, "QQ", &fmt, &fsize);
                if(bd != "TP")
                {
                    fmt[0] = bcfhelpers::missing_float();
                }
                else
                {
                    const float qqq = bcfhelpers::getFormatFloat(_impl->hdr, *cur, "QQ", 1);
                    const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 1);
                    if(bd == "TP" && !std::isnan(qqq))
                    {
                        fmt[0] = qqq;
                    }
                    else
                    {
                        fmt[0] = t_qq;
                    }

                }
                bcf_update_format_float(_impl->hdr, *cur, "QQ", fmt, fsize);
            }
            free(fmt);

#ifdef DEBUG_BLOCKQUANTIFY
            const int bs = bcfhelpers::getInfoInt(_impl->hdr, *current_bs_start, "BS", -1);
            std::string values;
            for(float x : tp_qqs)
            {
                values += std::to_string(x) + ",";
            }
            std::cerr << "BS: " << bs << " T_QQ = " << t_qq << " [" << values << "]" << "\n";
#endif
        };

        const auto update_bs_conf_boundary_flag = [this, &current_bs_start](BlockQuantifyImpl::variantlist_t::iterator to)
        {
            static const int has_conf = 1;
            static const int has_non_conf = 2;
            int conf_non_conf = 0;
            for(auto cur = current_bs_start; cur != to; ++cur)
            {
                const std::string regions = bcfhelpers::getInfoString(_impl->hdr, *cur, "Regions", "");

                if(regions.find("CONF") == std::string::npos)
                {
                    conf_non_conf |= has_non_conf;
                }
                else
                {
                    conf_non_conf |= has_conf;
                }
                if(regions.find("TS_boundary") != std::string::npos)
                {
                    conf_non_conf |= has_non_conf | has_conf;
                }
            }

            for(auto cur = current_bs_start; cur != to; ++cur)
            {
                const std::string regions = bcfhelpers::getInfoString(_impl->hdr, *cur, "Regions", "");

                if(conf_non_conf == (has_conf | has_non_conf))
                {
                    if(regions.find("TS_boundary") == std::string::npos)
                    {
                        bcf_update_info_string(_impl->hdr,
                                               *cur, "Regions",
                                               (regions.empty() ? "TS_boundary" :
                                                regions + ",TS_boundary").c_str());
                    }
                }
                else if(conf_non_conf == has_conf)
                {
                    if(regions.find("TS_contained") == std::string::npos)
                    {
                        // also flag fully confident superloci
                        bcf_update_info_string(_impl->hdr,
                                               *cur, "Regions",
                                               (regions.empty() ? "TS_contained" :
                                                regions + ",TS_contained").c_str());
                    }
                }
            }
        };


        for(auto v_it = _impl->variants.begin(); v_it != _impl->variants.end(); ++v_it)
        {
            // update fields, must output GA4GH-compliant fields
            countVariants(*v_it);

            // determine benchmarking superlocus
            const std::string vchr = bcfhelpers::getChrom(_impl->hdr, *v_it);
            const int vbs = bcfhelpers::getInfoInt(_impl->hdr, *v_it, "BS");
            if(!current_bs_valid)
            {
                current_bs = vbs;
                current_chr = vchr;
                current_bs_valid = true;
            }

#ifdef DEBUG_BLOCKQUANTIFY
            std::cerr << "current BS = " << current_bs << " vbs = " << vbs << "\n";
#endif

            if(   current_bs_start != v_it
               && (vbs != current_bs || vbs < 0 || vchr != current_chr))
            {
#ifdef DEBUG_BLOCKQUANTIFY
                std::cerr << "finishing BS = " << current_bs << " vbs = " << vbs << "\n";
#endif
                update_bs_qq(v_it);
                update_bs_filters(v_it);
                update_bs_conf_boundary_flag(v_it);
                current_bs = vbs;
                current_chr = vchr;
                current_bs_start = v_it;
            }
        }

        // do final superlocus (if any)
        update_bs_qq(_impl->variants.end());
        update_bs_filters(_impl->variants.end());
        update_bs_conf_boundary_flag(_impl->variants.end());

        for(auto & v : _impl->variants)
        {
#ifdef DEBUG_BLOCKQUANTIFY
            lastpos = v->pos;
#endif
            // use BD and BVT to make ROCs
            rocEvaluate(v);
        }
#ifdef DEBUG_BLOCKQUANTIFY
        std::cerr << "finished block " << lastpos << " - " << _impl->variants.size() << " records on thread " << std::this_thread::get_id() << "\n";
#endif
        _impl->fasta_to_use.reset(nullptr);
    }
Example #18
0
static void init_data(args_t *args)
{
    args->prev_rid = args->skip_rid = -1;
    args->hdr = args->files->readers[0].header;

    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = strdup(args->hdr->samples[0]);
    }
    if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");

    // Set samples
    kstring_t str = {0,0,0};
    if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
    {
        int i, n;
        char **smpls = hts_readlist(args->estimate_AF, 1, &n);

        // Make sure the query sample is included
        for (i=0; i<n; i++)
            if ( !strcmp(args->sample,smpls[i]) ) break;

        // Add the query sample if not present
        if ( i!=n ) kputs(args->sample, &str);

        for (i=0; i<n; i++)
        {
            if ( str.l ) kputc(',', &str);
            kputs(smpls[i], &str);
            free(smpls[i]);
        }
        free(smpls);
    }
    else if ( !args->estimate_AF )
        kputs(args->sample, &str);

    if ( str.l )
    {
        int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
        if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
        else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
    }

    if ( args->af_tag )
        if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
            error("No such INFO tag in the VCF: %s\n", args->af_tag);

    args->nsmpl = bcf_hdr_nsamples(args->hdr);
    args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
    free(str.s);

    int i;
    for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);

    // Init transition matrix and HMM
    double tprob[4];
    MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
    MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
    MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
    MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; 

    if ( args->genmap_fname ) 
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
    }
    else if ( args->rec_rate > 0 )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);

    }
    else
        args->hmm = hmm_init(2, tprob, 10000);

    // print header
    printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
    printf("# The command line was:\tbcftools %s", args->argv[0]);
    for (i=1; i<args->argc; i++)
        printf(" %s",args->argv[i]);
    printf("\n#\n");
    printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
}
Example #19
0
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
    tok->tok_type = TOK_VAL;
    tok->hdr_id   = -1;
    tok->pass     = -1;

    // is this a string constant?
    if ( str[0]=='"' || str[0]=='\'' )
    {
        int quote = str[0];
        if ( str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
        tok->key = (char*) calloc(len-1,sizeof(char));
        tok->num_value = len-2;
        memcpy(tok->key,str+1,len-2);
        tok->key[len-2] = 0;
        return 0;
    }
    if ( !strncmp(str,"INFO/",5) ) { str += 5; len -= 5; }
    if ( !strncmp(str,"%QUAL",len) )
    {
        tok->setter = filters_set_qual;
        tok->tag = strdup("%QUAL");
        return 0;
    }
    if ( !strncmp(str,"%TYPE",len) )
    {
        tok->setter = filters_set_type;
        tok->tag = strdup("%TYPE");
        return 0;
    }
    if ( !strncmp(str,"%FILTER",len) )
    {
        tok->setter = filters_set_filter;
        tok->comparator = filters_cmp_filter;
        tok->tag = strdup("%FILTER");
        return 0;
    }

    // is this one of the VCF tags? For now do only INFO and QUAL, to be extended...
    kstring_t tmp = {0,0,0};
    kputsn(str, len, &tmp);

    tok->hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, tmp.s);
    if ( tok->hdr_id>=0 ) 
    {
        if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
            tok->setter = filters_set_info_flag;
        else
            tok->setter = filters_set_info;
        tok->tag = strdup(tmp.s);
        if ( tmp.s ) free(tmp.s);
        return 0;
    }

    // is it a substrict VCF vector tag?
    if ( tmp.s[tmp.l-1] == ']' )
    {
        int i;
        for (i=0; i<tmp.l; i++)
            if ( tmp.s[i]=='[' ) { tmp.s[i] = 0; break; }

        tok->hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, tmp.s);
        if ( tok->hdr_id>=0 )
        {
            switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) ) 
            {
                case BCF_HT_INT:  tok->setter = &filters_set_info_int; break;
                case BCF_HT_REAL: tok->setter = &filters_set_info_float; break;
                default: error("FIXME: not ready for this, sorry\n");
            }
            tok->idx = atoi(&tmp.s[i+1]);
            if ( tmp.s ) free(tmp.s);
            return 0;
        }
    }

    // is it a value?
    char *end;
    errno = 0;
    tok->threshold = strtod(tmp.s, &end);
    if ( errno!=0 || end!=tmp.s+len ) error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);

    if ( tmp.s ) free(tmp.s);
    return 0;
}
Example #20
0
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    memset(&args,0,sizeof(args_t));

    int i;

    static struct option loptions[] =
    {
        {"help",            no_argument,       0,'h'},
        {"sample-list",     required_argument, 0,'s'},
        {0,0,0,0}
    };

    char **smps_strs = NULL;

    int c;
    while ((c = getopt_long(argc, argv, "?s:h",loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 's': smps_strs = hts_readlist(optarg,0,&(args.n_sel_smps));
                      if ( args.n_sel_smps == 0 )
                      {
                          fprintf(stderr, "Sample specification not valid.\n");
                          error("%s", usage());
                      }
                      break;
            case 'h': usage(); break;
            case '?':
            default: error("%s", usage()); break;
        }
    }
    if ( optind != argc )  usage();  // too many files given

    args.hdr = bcf_hdr_dup(in);

    // Samples parsing from header and input option
    if ( !bcf_hdr_nsamples(args.hdr) )
    {
        error("No samples in input file.\n");
    }
    args.nsmp = bcf_hdr_nsamples(args.hdr);
    args.selected_smps = (int*) calloc(args.nsmp,sizeof(int));
    for ( i = 0; i < args.n_sel_smps; i++ )
    {
        int ind = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, smps_strs[i]);
        if ( ind == -1 )
        {
            error("Sample '%s' not in input vcf file.\n", smps_strs[i]);
        } else {
            args.selected_smps[ind] = 1;
        }
        free(smps_strs[i]);
    }
    free(smps_strs);

    /*
    fprintf(stderr, "Selected samples array:[");
    for (i=0;i<args.nsmp;i++)
    {
        fprintf(stderr, " %i", args.selected_smps[i]);
    }
    fprintf(stderr, " ]\n");
    */

    if ( bcf_hdr_id2int(args.hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header\n", __func__);

    args.gt_arr = NULL;

    return 0;
}
Example #21
0
int main(int argc, char **argv)
{
    int i, n;
    static struct option const long_opts[] =
    {
	{"out", required_argument, NULL, 1},
	{"report", required_argument, NULL, 2},
	{"dotasref", no_argument, NULL, 3},
	{"help", no_argument, NULL, 0},
	{"version", no_argument, NULL, 4},
	{"export_uncov", no_argument, NULL, 5}
    };
    bool help = FALSE;
    bool report_version = FALSE;
    while ((n = getopt_long(argc, argv, "1:2:304", long_opts, NULL)) >= 0)
    {
	switch (n)
	{
	case 1 : outfile = strdup(optarg); break;
	case 2 : report = strdup(optarg); break;
	case 3 : dotasref = TRUE; break;
	case 0 : help = TRUE; break;
	case 4 : report_version = TRUE; break;
	case 5 : export_uncover = TRUE; break;
	default : return 1;
	}
	if ( help ) return usage();
	if ( report_version ) return show_version();
    }
    n = argc - optind;
    if ( n > 1 ) errabort("only accept one input vcf");
    if ( export_uncover == TRUE && outfile == FALSE) {
	warnings("export uncove region only used with option --out");
	export_uncover = FALSE;
    }
    char * input;
    if ( n == 0 ) input = strdup("-");
    else input = strdup(argv[optind]);
    htsFile * fp = read_vcf_file(input);
    enum htsExactFormat fmt = hts_get_format(fp)->format;
    if ( fmt != vcf && fmt != bcf ) errabort("This is not a VCF/BCF file : %s", input);
    bcf_hdr_t * hdr = bcf_hdr_read(fp);
    int n_samples = bcf_hdr_nsamples(hdr);
    if ( n_samples != 2 ) errabort("the input VCF/BCF file must contain only two samples! %d", n_samples);
    LOG("Using sample %s as ref ...", hdr->samples[0]);
    LOG("Using sample %s as test ...", hdr->samples[1]);
    uint32_t matrix[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} };
    bcf1_t * v = bcf_init1();
    kstring_t str = { 0, 0, 0 };
    uint32_t line = 0;
    htsFile *out = NULL;
    if ( outfile && !check_filename(outfile) ) out = hts_open(outfile, mode);
    if ( out != NULL ) bcf_hdr_write(out, hdr);
    while ( bcf_read1(fp, hdr, v) >= 0 )
    {
	bcf_unpack(v, BCF_UN_STR|BCF_UN_FMT);
	int k;
	str.l = 0;
	int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT");
	if ( !bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, tag_id) ) warnings("There is no 'GT' in the header!");
	for ( i = 0; i < v->n_fmt; ++i )
	    if ( v->d.fmt[i].id == tag_id ) break;
	if ( i == v->n_fmt ) {
	    vcf_format1(hdr, v, &str);
	    LOG("There is no tag GT in this line : %s", str.s);
	    continue;
	}
	corr_t xy[2] = { {-1, -2, -2}, {-1, -2, -2} };
	bcf_fmt_t * fmt = &v->d.fmt[i];

	for ( i = 0; i < 2; ++i )
	{
	    int corr = i;
	    if ( fmt == NULL ) {
		if ( dotasref == TRUE ) xy[corr].alt = ALT_IS_REF;
		else xy[corr].alt = ALT_IS_UNC;
		continue;
	    }
	    int last = -2;
	    uint8_t *d = (uint8_t*)((char*)fmt->p + fmt->size*i);
	    for ( k = 0; k < fmt->n && d[k] != (uint8_t)bcf_int8_vector_end; ++k )
	    {
		int curr = d[k]>>1;
		if ( last != curr ) {
		    if ( curr ) {
			if ( last == -2 ) xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF;
			else xy[corr].alt =  ALT_IS_HET;
		    } else {
			xy[corr].alt =  dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC;
		    }
		} else {
		    if ( curr ) {
			xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF;
		    } else {
			xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC;
		    }
		}
		if (last == -2 ) {
		    xy[corr].min = xy[corr].max = curr;
		} else {
		    if ( curr < xy[corr].min ) xy[corr].min = curr;
		    else if ( curr > xy[corr].max ) xy[corr].max = curr;
		}
		last = curr;
	    }
	}
	matrix[xy[0].alt][xy[1].alt]++;
	if ( xy[0].alt != xy[1].alt && out != NULL) {
	    if ( xy[0].alt == ALT_IS_UNC || xy[1].alt == ALT_IS_UNC ) {
		if ( export_uncover == TRUE ) {
		    str.l = 0;
		    vcf_format1(hdr, v, &str);
		    vcf_write(out, hdr, v);
		}
	    } else {
		str.l = 0;
		vcf_format1(hdr, v, &str);
		vcf_write(out, hdr, v);
	    }
	}
	if ( xy[0].alt == ALT_IS_HET && xy[1].alt == ALT_IS_HET && (xy[0].min != xy[1].min || xy[0].max != xy[1].max ) ) {
	    bias++;
	    matrix[ALT_IS_HET][ALT_IS_HET]--;
	    if ( out != NULL ) {
		str.l = 0;
		vcf_format1(hdr, v, &str);
		vcf_write(out, hdr, v);
	    }
	}
	line++;
    }
    if ( out ) hts_close(out);
    if ( str.m ) free(str.s);
    write_report(matrix, hdr);
    bcf_hdr_destroy(hdr);
    free(input);
    bcf_destroy1(v);
    if ( outfile ) free(outfile);
    if ( report ) free(report);
    if ( hts_close(fp) ) warnings("hts_close returned non-zero status: %s", input);
    return 0;
}
int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
{
    int i, j, nsmpl, free_smpl = 0;
    char **smpl = NULL;

    void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
    if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
    {
        smpl = hts_readlist(fname, is_file, &nsmpl);
        if ( !smpl )
        {
            fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
            return 0;
        }
        if ( exclude )
        {
            for (i=0; i<nsmpl; i++)
                khash_str2int_inc(exclude, smpl[i]);
        }
        free_smpl = 1;
    }
    if ( !smpl )
    {
        smpl  = files->readers[0].header->samples;   // intersection of all samples
        nsmpl = bcf_hdr_nsamples(files->readers[0].header);
    }

    files->samples = NULL;
    files->n_smpl  = 0;
    for (i=0; i<nsmpl; i++)
    {
        if ( exclude && khash_str2int_has_key(exclude,smpl[i])  ) continue;

        int n_isec = 0;
        for (j=0; j<files->nreaders; j++)
        {
            if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
            n_isec++;
        }
        if ( n_isec!=files->nreaders )
        {
            fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
            continue;
        }

        files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
        files->samples[files->n_smpl++] = strdup(smpl[i]);
    }

    if ( exclude ) khash_str2int_destroy(exclude);
    if ( free_smpl )
    {
        for (i=0; i<nsmpl; i++) free(smpl[i]);
        free(smpl);
    }

    if ( !files->n_smpl )
    {
        if ( files->nreaders>1 )
            fprintf(stderr,"No samples in common.\n");
        return 0;
    }
    for (i=0; i<files->nreaders; i++)
    {
        bcf_sr_t *reader = &files->readers[i];
        reader->samples  = (int*) malloc(sizeof(int)*files->n_smpl);
        reader->n_smpl   = files->n_smpl;
        for (j=0; j<files->n_smpl; j++)
            reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
    }
    return 1;
}
Example #23
0
void abcWriteBcf::print(funkyPars *pars){
  if(doBcf==0)
    return;
  kstring_t buf;
  if(fp==NULL){
    buf.s=NULL;buf.l=buf.m=0;
    fp=aio::openFileHts(outfiles,".bcf");
    hdr = bcf_hdr_init("w");
    rec    = bcf_init1();
    print_bcf_header(fp,hdr,args,buf,header);
  }
  lh3struct *lh3 = (lh3struct*) pars->extras[5];
  freqStruct *freq = (freqStruct *) pars->extras[6];
  genoCalls *geno = (genoCalls *) pars->extras[10];
  
  for(int s=0;s<pars->numSites;s++){
    if(pars->keepSites[s]==0)
      continue;

    rec->rid = bcf_hdr_name2id(hdr,header->target_name[pars->refId]);
    rec->pos = pars->posi[s];//<- maybe one index?
    //    bcf_update_id(hdr, rec, "rs6054257");
    char majmin[4]={intToRef[pars->major[s]],',',intToRef[pars->minor[s]],'\0'};
    bcf_update_alleles_str(hdr, rec, majmin);
    rec->qual = 29;
    // .. FILTER
    int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
    bcf_update_filter(hdr, rec, &tmpi, 1);
    // .. INFO
    
    tmpi = pars->keepSites[s];
    bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);

    if(pars->counts){
      int depth = 0;
      for(int i=0; i<4*pars->nInd; i++)
	depth += pars->counts[s][i];
      tmpi = depth;
      bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);

    }
    if(freq){
      float tmpf = freq->freq_EM[s];
      bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
    }
    
    // .. FORMAT
    assert(geno);
    if(geno){
      int32_t *tmpia = (int*)malloc(bcf_hdr_nsamples(hdr)*2*sizeof(int32_t));
      for(int i=0; i<pars->nInd;i++){
	if(geno->dat[s][i]==0){
	  tmpia[2*i+0] = bcf_gt_unphased(0);
	  tmpia[2*i+1] = bcf_gt_unphased(0);
	}else if(geno->dat[s][i]==1){
	  tmpia[2*i+0] = bcf_gt_unphased(0);
	  tmpia[2*i+1] = bcf_gt_unphased(1);
	}  else{
	  tmpia[2*i+0] = bcf_gt_unphased(1);
	  tmpia[2*i+1] = bcf_gt_unphased(1);
	}
      }
      bcf_update_genotypes(hdr, rec, tmpia, bcf_hdr_nsamples(hdr)*2); 
      free(tmpia);
    }
    if(pars->counts){
      int32_t *tmpfa = (int32_t*)malloc(sizeof(int32_t)*bcf_hdr_nsamples(hdr));
      suint *ary=pars->counts[s];
      for(int i=0;i<bcf_hdr_nsamples(hdr);i++)
	tmpfa[i] = ary[0]+ary[1]+ary[2]+ary[3];
      bcf_update_format_int32(hdr, rec, "DP", tmpfa,bcf_hdr_nsamples(hdr) );
      free(tmpfa);
    }
    assert(lh3);
    if(lh3){
      float *tmpfa  =   (float*)malloc(3*bcf_hdr_nsamples(hdr)*sizeof(float  ));
      int32_t *tmpi = (int32_t*)malloc(3*bcf_hdr_nsamples(hdr)*sizeof(int32_t));
      double *ary = lh3->lh3[s];
      for(int i=0;i<bcf_hdr_nsamples(hdr);i++)
	for(int j=0;j<3;j++){
	  tmpfa[i*3+j] = ary[i*3+j]/M_LN10;
	  tmpi[i*3+j] =(int) -log10(exp(ary[i*3+j]))*10.0;
	  //	  fprintf(stderr,"pl:%d raw:%f\n",tmpi[i*3+j],ary[i*3+j]);
	}
      bcf_update_format_float(hdr, rec, "GL", tmpfa,3*bcf_hdr_nsamples(hdr) );
      bcf_update_format_int32(hdr, rec, "PL", tmpi,3*bcf_hdr_nsamples(hdr) );
      free(tmpfa);
      free(tmpi);
    }

    if ( bcf_write1(fp, hdr, rec)!=0 ){
      fprintf(stderr,"Failed to write to \n");
      exit(0);
    }
    //    fprintf(stderr,"------\n");
    bcf_clear1(rec);
  }
}
Example #24
0
VariantBuilder& VariantBuilder::set_chromosome(const std::string& chromosome) {
  // Note: we will validate the contig id (including checking for -1) at build time, if validation is turned on
  m_contig.set(bcf_hdr_id2int(m_header.m_header.get(), BCF_DT_CTG, chromosome.c_str()));
  return *this;
}
Example #25
0
static void init_data(args_t *args)
{
    args->sr = bcf_sr_init();
    if ( args->region )
    {
        args->sr->require_index = 1;
        if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region);
    }
    if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target);
    if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
    args->hdr_in  = bcf_sr_get_header(args->sr,0);
    args->hdr_out = bcf_hdr_dup(args->hdr_in);

    if ( args->filter_str )
        args->filter = filter_init(args->hdr_in, args->filter_str);

    mkdir_p("%s/",args->output_dir);

    int i, nsmpl = bcf_hdr_nsamples(args->hdr_in);
    if ( !nsmpl ) error("No samples to split: %s\n", args->fname);
    args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh));
    args->bnames = set_file_base_names(args);
    kstring_t str = {0,0,0};
    for (i=0; i<nsmpl; i++)
    {
        if ( !args->bnames[i] ) continue;
        str.l = 0;
        kputs(args->output_dir, &str);
        if ( str.s[str.l-1] != '/' ) kputc('/', &str);
        int k, l = str.l;
        kputs(args->bnames[i], &str);
        for (k=l; k<str.l; k++) if ( isspace(str.s[k]) ) str.s[k] = '_';
        if ( args->output_type & FT_BCF ) kputs(".bcf", &str);
        else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str);
        else kputs(".vcf", &str);
        args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type));
        if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno));
        bcf_hdr_nsamples(args->hdr_out) = 1;
        args->hdr_out->samples[0] = args->bnames[i];
        bcf_hdr_write(args->fh[i], args->hdr_out);
    }
    free(str.s);

    // parse tags
    int is_info = 0, is_fmt = 0;
    char *beg = args->keep_tags;
    while ( beg && *beg )
    {
        if ( !strncasecmp("INFO/",beg,5) ) { is_info = 1; is_fmt = 0; beg += 5; }
        else if ( !strcasecmp("INFO",beg) ) { args->keep_info = 1; break; }
        else if ( !strncasecmp("INFO,",beg,5) ) { args->keep_info = 1; beg += 5; continue; }
        else if ( !strncasecmp("FMT/",beg,4) ) { is_info = 0; is_fmt = 1; beg += 4; }
        else if ( !strncasecmp("FORMAT/",beg,7) ) { is_info = 0; is_fmt = 1; beg += 7; }
        else if ( !strcasecmp("FMT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strcasecmp("FORMAT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strncasecmp("FMT,",beg,4) ) { args->keep_fmt = 1; beg += 4; continue; }
        else if ( !strncasecmp("FORMAT,",beg,7) ) { args->keep_fmt = 1; beg += 7; continue; }
        char *end = beg;
        while ( *end && *end!=',' ) end++;
        char tmp = *end; *end = 0;
        int id = bcf_hdr_id2int(args->hdr_in, BCF_DT_ID, beg);
        beg = tmp ? end + 1 : end;
        if ( is_info && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_INFO,id) )
        {
            if ( id >= args->ninfo_tags ) args->ninfo_tags = id + 1;
            hts_expand0(uint8_t, args->ninfo_tags, args->minfo_tags, args->info_tags);
            args->info_tags[id] = 1;
        }
        if ( is_fmt && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_FMT,id) )
        {
            if ( id >= args->nfmt_tags ) args->nfmt_tags = id + 1;
            hts_expand0(uint8_t, args->nfmt_tags, args->mfmt_tags, args->fmt_tags);
            args->fmt_tags[id] = 1;
        }
    }
    if ( !args->keep_info && !args->keep_fmt && !args->ninfo_tags && !args->nfmt_tags )
    {
        args->keep_info = args->keep_fmt = 1;
    }
}
Example #26
0
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm
filter_t *filter_init(bcf_hdr_t *hdr, const char *str)
{
    filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t));
    filter->str = strdup(str);
    filter->hdr = hdr;

    int nops = 0, mops = 0, *ops = NULL;    // operators stack
    int nout = 0, mout = 0;                 // filter tokens, RPN
    token_t *out = NULL;
    char *tmp = filter->str;
    int last_op = -1;
    while ( *tmp )
    {
        int len, ret;
        ret = filters_next_token(&tmp, &len);
        if ( ret==-1 ) error("Missing quotes in: %s\n", str);

        // fprintf(stderr,"token=[%c] .. [%s] %d\n", "x()[<=>]!|&+-*/Mm"[ret], tmp, len);
        // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", "x()[<=>]!|&+-*/Mm"[ops[i]]); fprintf(stderr,"\n");

        if ( ret==TOK_MAX || ret==TOK_MIN || ret==TOK_AVG )
        {
            nout++;
            hts_expand0(token_t, nout, mout, out);
            filters_init_func(filter, ret, &tmp, &out[nout-1]);
        }
        else if ( ret==TOK_LFT )         // left bracket
        {
            nops++;
            hts_expand(int, nops, mops, ops);
            ops[nops-1] = ret;
        }
        else if ( ret==TOK_RGT )    // right bracket
        {
            while ( nops>0 && ops[nops-1]!=TOK_LFT )
            {
                nout++;
                hts_expand0(token_t, nout, mout, out);
                out[nout-1].tok_type = ops[nops-1];
                nops--;
            }
            if ( nops<=0 ) error("Could not parse: %s\n", str);
            nops--;
        }
        else if ( ret!=TOK_VAL )    // one of the operators
        {
            // detect unary minus: replace -value with -1*(value)
            if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT )
            {
                nout++;
                hts_expand0(token_t, nout, mout, out);
                token_t *tok = &out[nout-1];
                tok->tok_type  = TOK_VAL;
                tok->hdr_id    = -1;
                tok->pass      = -1;
                tok->threshold = -1.0;
                ret = TOK_MULT;
            }
            else
            {
                while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] )
                {
                    nout++;
                    hts_expand0(token_t, nout, mout, out);
                    out[nout-1].tok_type = ops[nops-1];
                    nops--;
                }
            }
            nops++;
            hts_expand(int, nops, mops, ops);
            ops[nops-1] = ret;
        }
        else if ( !len ) 
        {
            if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str);
            break;     // all tokens read
        }
        else           // annotation name or filtering value
        {
            nout++;
            hts_expand0(token_t, nout, mout, out);
            filters_init1(filter, tmp, len, &out[nout-1]);
            tmp += len;
        }
        last_op = ret;
    }
    while ( nops>0 )
    {
        if ( ops[nops-1]==TOK_LFT || ops[nops-1]==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str);
        nout++;
        hts_expand0(token_t, nout, mout, out);
        out[nout-1].tok_type = ops[nops-1];
        nops--;
    }

    // In the special cases of %TYPE and %FILTER the BCF header IDs are yet unknown. Walk through the
    // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be
    // just before or after the %FILTER token and they must be followed with a comparison operator.
    // This code is fragile: improve me.
    int i;
    for (i=0; i<nout; i++)
    {
        if ( out[i].tok_type!=TOK_VAL ) continue;
        if ( !out[i].tag ) continue;
        if ( !strcmp(out[i].tag,"%TYPE") )
        {
            if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
            int j = i+1;
            if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
            if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n",  __FILE__,__LINE__,__FUNCTION__, filter->str);
            if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) out[j].threshold = VCF_SNP;
            else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) out[j].threshold = VCF_INDEL;
            else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) out[j].threshold = VCF_MNP;
            else if ( !strcasecmp(out[j].key,"other") ) out[j].threshold = VCF_OTHER;
            else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str);
            out[j].tag = out[j].key; out[j].key = NULL;
            i = j;
            continue;
        }
        if ( !strcmp(out[i].tag,"%FILTER") )
        {
            if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str);
            int j = i+1;
            if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1;
            if ( out[j].tok_type!=TOK_VAL || !out[j].key )
                error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str);
            if ( strcmp(".",out[j].key) )
            {
                out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key);
                if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) )
                    error("The filter \"%s\" not present in the VCF header\n", out[j].key);
            }
            else
                out[j].hdr_id = -1;
            out[j].tag = out[j].key; out[j].key = NULL;
            out[i].hdr_id = out[j].hdr_id;
            i = j;
            continue;
        }
    }

    // filter_debug_print(out, nout);

    if ( mops ) free(ops);
    filter->filters   = out;
    filter->nfilters  = nout;
    filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout);
    return filter;
}
Example #27
0
static void init_data(args_t *args)
{
    bcf1_t *line = NULL;

    // With phased concat, the chunks overlap and come in the right order.  To
    // avoid opening all files at once, store start positions to recognise need
    // for the next one. This way we can keep only two open chunks at once.
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        line = bcf_init();
    }

    kstring_t str = {0,0,0};
    int i, prev_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
        args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
        if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
            error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        int j;
        for (j=0; j<bcf_hdr_nsamples(hdr); j++)
            if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
                error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        if ( args->phased_concat )
        {
            int ret = bcf_read(fp, hdr, line);
            if ( ret!=0 ) args->start_pos[i] = -2;  // empty file
            else
            {
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
                args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
                prev_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        hts_close(fp);
    }
    free(str.s);
    if ( line ) bcf_destroy(line);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    bcf_hdr_write(args->out_fh, args->out_hdr);

    if ( args->allow_overlaps )
    {
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        if ( args->remove_dups )
        {
            if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
            else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
            else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
        }
        for (i=0; i<args->nfnames; i++)
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
    }
    else if ( args->phased_concat )
    {
        // Remove empty files from the list
        int nok = 0;
        while (1)
        {
            while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
            if ( nok==args->nfnames ) break;

            i = nok;
            while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
            if ( i==args->nfnames ) break;

            int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp;
            char *str = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = str;
        }
        for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
        args->nfnames = nok;

        for (i=1; i<args->nfnames; i++)
            if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
                error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);

        args->prev_chr = -1;
        args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmism  = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->phase_set  = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        args->ifname = 0;
    }
}
Example #28
0
int run(int argc, char **argv)
{
    char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL;
    memset(&args,0,sizeof(args_t));
    args.mode = 0;
    args.output_fname = "-";

    static struct option loptions[] =
    {
        {"trio",1,0,'t'},
        {"trio-file",1,0,'T'},
        {"delete",0,0,'d'},
        {"list",1,0,'l'},
        {"count",0,0,'c'},
        {"rules",1,0,'r'},
        {"rules-file",1,0,'R'},
        {"output",required_argument,NULL,'o'},
        {"output-type",required_argument,NULL,'O'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0)
    {
        switch (c) 
        {
            case 'o': args.output_fname = optarg; break;
            case 'O':
                      switch (optarg[0]) {
                          case 'b': args.output_type = FT_BCF_GZ; break;
                          case 'u': args.output_type = FT_BCF; break;
                          case 'z': args.output_type = FT_VCF_GZ; break;
                          case 'v': args.output_type = FT_VCF; break;
                          default: error("The output type \"%s\" not recognised\n", optarg);
                      };
                      break;
            case 'R': rules_fname = optarg; break;
            case 'r': rules_string = optarg; break;
            case 'd': args.mode |= MODE_DELETE; break;
            case 'c': args.mode |= MODE_COUNT; break;
            case 'l': 
                if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; 
                else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; 
                else error("The argument not recognised: --list %s\n", optarg);
                break;
            case 't': trio_samples = optarg; break;
            case 'T': trio_file = optarg; break;
            case 'h':
            case '?':
            default: error("%s",usage()); break;
        }
    }
    if ( rules_fname )
        args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args);
    else
        args.rules = init_rules(&args, rules_string);
    if ( !args.rules ) return -1;
    args.itr     = regitr_init(args.rules);
    args.itr_ori = regitr_init(args.rules);

    char *fname = NULL;
    if ( optind>=argc || argv[optind][0]=='-' )
    {
        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
        else error("%s",usage());
    }
    else
        fname = argv[optind];

    if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n");
    if ( !args.mode ) error("Expected one of the -c, -d or -l options\n");
    if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD;

    args.sr = bcf_sr_init();
    if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum));
    args.hdr = bcf_sr_get_header(args.sr, 0);
    args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type));
    if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno));
    bcf_hdr_write(args.out_fh, args.hdr);


    int i, n = 0;
    char **list;
    if ( trio_samples )
    {
        args.ntrios = 1;
        args.trios = (trio_t*) calloc(1,sizeof(trio_t));
        list = hts_readlist(trio_samples, 0, &n);
        if ( n!=3 ) error("Expected three sample names with -t\n");
        args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        args.trios[0].ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
    }
    if ( trio_file )
    {
        list = hts_readlist(trio_file, 1, &n);
        args.ntrios = n;
        args.trios = (trio_t*) calloc(n,sizeof(trio_t));
        for (i=0; i<n; i++)
        {
            char *ss = list[i], *se;
            se = strchr(ss, ',');
            if ( !se ) error("Could not parse %s: %s\n",trio_file, ss);
            *se = 0;
            args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss);
            ss = ++se; 
            se = strchr(ss, ',');
            if ( !se ) error("Could not parse %s\n",trio_file);
            *se = 0;
            args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss);
            ss = ++se; 
            if ( *ss=='\0' ) error("Could not parse %s\n",trio_file);
            args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss);
            free(list[i]);
        }
        free(list);
    }

    while ( bcf_sr_next_line(args.sr) )
    {
        bcf1_t *line = bcf_sr_get_line(args.sr,0);
        line = process(line);
        if ( line )
        {
            if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode);
            bcf_write1(args.out_fh, args.hdr, line);
        }
    }


    fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n");
    for (i=0; i<args.ntrios; i++)
    {
        trio_t *trio = &args.trios[i];
        fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", 
            trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild)
            );
    }
    free(args.gt_arr);
    free(args.trios);
    regitr_destroy(args.itr);
    regitr_destroy(args.itr_ori);
    regidx_destroy(args.rules);
    bcf_sr_destroy(args.sr);
    if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n");
    return 0;
}
Example #29
0
static void init_data(args_t *args)
{
    args->aux.srs = bcf_sr_init();

    // Open files for input and output, initialize structures
    if ( args->targets )
    {
        if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
            error("Failed to read the targets: %s\n", args->targets);

        if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
        {
            args->aux.srs->targets->missed_reg_handler = print_missed_line;
            args->aux.srs->targets->missed_reg_data = args;
        }
    }
    if ( args->regions )
    {
        if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
            error("Failed to read the targets: %s\n", args->regions);
    }

    int i;
    if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open: %s\n", args->bcf_fname);

    if ( args->nsamples && args->nsamples != bcf_hdr_nsamples(args->aux.srs->readers[0].header) )
    {
        args->samples_map = (int *) malloc(sizeof(int)*args->nsamples);
        args->aux.hdr = bcf_hdr_subset(args->aux.srs->readers[0].header, args->nsamples, args->samples, args->samples_map);
        for (i=0; i<args->nsamples; i++)
            if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
        if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
    }
    else
    {
        args->aux.hdr = bcf_hdr_dup(args->aux.srs->readers[0].header);
        for (i=0; i<args->nsamples; i++)
            if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
                error("No such sample: %s\n", args->samples[i]);
    }

    // Reorder ploidy and family indexes to match mpileup's output and exclude samples which are not available
    if ( args->aux.ploidy )
    {
        for (i=0; i<args->aux.nfams; i++)
        {
            int j;
            for (j=0; j<3; j++)
            {
                int k = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[ args->aux.fams[i].sample[j] ]);
                if ( k<0 ) error("No such sample: %s\n", args->samples[ args->aux.fams[i].sample[j] ]);
                args->aux.fams[i].sample[j] = k;
            }
        }
        uint8_t *ploidy = (uint8_t*) calloc(bcf_hdr_nsamples(args->aux.hdr), 1);
        for (i=0; i<args->nsamples; i++)    // i index in -s sample list
        {
            int j = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[i]);     // j index in the output VCF / subset VCF
            if ( j<0 )
            {
                fprintf(stderr,"Warning: no such sample: \"%s\"\n", args->samples[i]);
                continue;
            }
            ploidy[j] = args->aux.ploidy[i];
        }
        args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
        for (i=0; i<args->nsamples; i++)
            assert( ploidy[i]==0 || ploidy[i]==1 || ploidy[i]==2 );
        free(args->aux.ploidy);
        args->aux.ploidy = ploidy;
    }

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));

    if ( args->flag & CF_QCALL )
        return;

    if ( args->flag & CF_MCALL )
        mcall_init(&args->aux);

    if ( args->flag & CF_CCALL )
        ccall_init(&args->aux);

    if ( args->flag&CF_GVCF )
    {
        bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        args->gvcf.rid  = -1;
        args->gvcf.line = bcf_init1();
        args->gvcf.gt   = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr));
        for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
        {
            args->gvcf.gt[2*i+0] = bcf_gt_unphased(0);
            args->gvcf.gt[2*i+1] = bcf_gt_unphased(0);
        }
    }

    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");

    bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
    bcf_hdr_write(args->out_fh, args->aux.hdr);

    if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}
Example #30
0
static void init_data(args_t *args)
{
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    args->hdr = args->files->readers[0].header;
    args->flt_pass = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"PASS"); assert( !args->flt_pass );  // sanity check: required by BCF spec

    // -i or -e: append FILTER line
    if ( args->soft_filter && args->filter_logic )
    {
        kstring_t flt_name = {0,0,0};
        if ( strcmp(args->soft_filter,"+") )
            kputs(args->soft_filter, &flt_name);
        else
        {
            // Make up a filter name
            int i = 0, id = -1;
            do
            {
                ksprintf(&flt_name,"Filter%d", ++i);
                id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s);
            }
            while ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,id) );
        }
        // escape quotes
        kstring_t tmp = {0,0,0};
        char *t = args->filter_str;
        while ( *t )
        {
            if ( *t=='"' ) kputc('\\',&tmp);
            kputc(*t,&tmp);
            t++;
        }
        int ret = bcf_hdr_printf(args->hdr, "##FILTER=<ID=%s,Description=\"Set if %s: %s\">", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s);
        if ( ret!=0 )
            error("Failed to append header line: ##FILTER=<ID=%s,Description=\"Set if %s: %s\">\n", flt_name.s,args->filter_logic & FLT_INCLUDE ? "not true" : "true", tmp.s);
        args->flt_fail = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s); assert( args->flt_fail>=0 );
        free(flt_name.s);
        free(tmp.s);
    }

    if ( args->snp_gap || args->indel_gap )
    {
        if ( !args->filter_logic && args->soft_filter && strcmp(args->soft_filter,"+") )
        {
            kstring_t tmp = {0,0,0};
            if ( args->snp_gap ) kputs("\"SnpGap\"", &tmp);
            if ( args->indel_gap )
            {
                if ( tmp.s ) kputs(" and ", &tmp);
                kputs("\"IndelGap\"", &tmp);
            }
            fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
            free(tmp.s);
        }

        rbuf_init(&args->rbuf, 64);
        args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
        if ( args->snp_gap )
        {
            bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
            args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
            assert( args->SnpGap_id>=0 );
        }
        if ( args->indel_gap )
        {
            bcf_hdr_printf(args->hdr, "##FILTER=<ID=IndelGap,Description=\"Indel within %d bp of an indel\">", args->indel_gap);
            args->IndelGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "IndelGap");
            assert( args->IndelGap_id>=0 );
        }
    }

    if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");

    if ( args->filter_str )
        args->filter = filter_init(args->hdr, args->filter_str);
}