/*
    Plugin start-up hook, called once before any record is processed.

    Parses the "tags" list from opts (defaulting to "tags=PL,GL,GT") and, for
    each requested tag, registers the matching dosage calculator in the global
    handlers[] array:
      - PL and GL are registered only when the tag is defined as a FORMAT field
        in the input header; their header type is remembered in pl_type/gl_type
        and must be Integer or Float.
      - GT is always registered.
    Finally prints the tab-delimited column header to stdout.

    Returns 1 (suppress the standard VCF/BCF header) on success and -1 on a
    critical error (non-numeric PL/GL or an unknown tag). The `out` header is
    not used here.
*/
int init(const char *opts, bcf_hdr_t *in, bcf_hdr_t *out)
{
    int itag, ismpl;

    in_hdr = in;
    tags   = config_get_list(opts ? opts : "tags=PL,GL,GT","tags", &ntags);

    for (itag=0; itag<ntags; itag++)
    {
        const char *tag = tags[itag];
        dosage_f handler;
        int id;

        if ( strcmp("PL",tag)==0 )
        {
            id = bcf_hdr_id2int(in_hdr,BCF_DT_ID,"PL");
            // Tag requested but absent from the header: nothing to register
            if ( !bcf_hdr_idinfo_exists(in_hdr,BCF_HL_FMT,id) ) continue;
            pl_type = bcf_hdr_id2type(in_hdr,BCF_HL_FMT,id);
            if ( pl_type!=BCF_HT_INT && pl_type!=BCF_HT_REAL )
            {
                fprintf(stderr,"Expected numeric type of FORMAT/PL\n");
                return -1;
            }
            handler = calc_dosage_PL;
        }
        else if ( strcmp("GL",tag)==0 )
        {
            id = bcf_hdr_id2int(in_hdr,BCF_DT_ID,"GL");
            if ( !bcf_hdr_idinfo_exists(in_hdr,BCF_HL_FMT,id) ) continue;
            gl_type = bcf_hdr_id2type(in_hdr,BCF_HL_FMT,id);
            if ( gl_type!=BCF_HT_INT && gl_type!=BCF_HT_REAL )
            {
                fprintf(stderr,"Expected numeric type of FORMAT/GL\n");
                return -1;
            }
            handler = calc_dosage_GL;
        }
        else if ( strcmp("GT",tag)==0 )
        {
            handler = calc_dosage_GT;
        }
        else
        {
            fprintf(stderr,"No handler for tag \"%s\"\n", tag);
            return -1;
        }

        // Append the selected calculator to the list of handlers
        handlers = (dosage_f*) realloc(handlers,(nhandlers+1)*sizeof(*handlers));
        handlers[nhandlers++] = handler;
    }
    free(tags[0]);
    free(tags);

    // Column header: four fixed columns followed by one column per sample
    printf("#[1]CHROM\t[2]POS\t[3]REF\t[4]ALT");
    for (ismpl=0; ismpl<bcf_hdr_nsamples(in_hdr); ismpl++)
        printf("\t[%d]%s", ismpl+5,in_hdr->samples[ismpl]);
    printf("\n");

    return 1;
}
/*
    Parses the argument of a MAX()/MIN()/AVG() filtering function and prepares
    the token that will evaluate it.

    On entry *str points just past the opening bracket. The text up to the
    closing ')' is taken as a FORMAT tag name; *str is advanced past the ')'.
    The tag must be defined as a FORMAT field of type Integer or Float in
    filter->hdr.

    filter     the filter being built (its header is used for tag lookup)
    func_type  one of TOK_MAX, TOK_MIN, TOK_AVG
    str        in/out cursor into the expression being parsed
    tok        token to fill; takes ownership of the allocated tag name

    Returns 0 on success. Every failure is reported through error().
*/
static int filters_init_func(filter_t *filter, int func_type, char **str, token_t *tok)
{
    char *e = *str;
    while ( *e && *e!=')' ) e++;
    // Pass the string itself, not the char** cursor, to match the %s specifier
    if ( !*e ) error("Could not parse the expression, right bracket not found [...%s]\n", *str);

    kstring_t tmp = {0,0,0};
    kputsn(*str, e-(*str), &tmp);
    (*str) += e-(*str)+1;

    tok->hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, tmp.s);
    if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FMT,tok->hdr_id) )
        error("[%s:%d %s] Error: the tag \"FORMAT/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);

    int fmt_type = bcf_hdr_id2type(filter->hdr,BCF_HL_FMT,tok->hdr_id);
    if ( fmt_type!=BCF_HT_INT && fmt_type!=BCF_HT_REAL )
        error("[%s:%d %s] Error: expected numeric tag with %s\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);

    switch (func_type)
    {
        case TOK_MAX: tok->setter = filters_set_format_max; break;
        case TOK_MIN: tok->setter = filters_set_format_min; break;
        case TOK_AVG: tok->setter = filters_set_format_avg; break;
        default: error("[%s:%d %s] Error: unknown func_type: %d\n", __FILE__,__LINE__,__FUNCTION__,func_type);
    }
    tok->tok_type = TOK_VAL;
    tok->tag = tmp.s;   // ownership of the buffer moves to the token
    return 0;
}
/*
 * Registers, for one input header, the mapping from each of its FILTER/INFO/FORMAT
 * field ids to the id of the same key in the merged header.
 * The merged header must already exist and must already contain every key
 * (both are checked with assert).
 */
void VariantHeaderMerger<fields_forward_LUT_ordering, fields_reverse_LUT_ordering, samples_forward_LUT_ordering, samples_reverse_LUT_ordering>::
  add_header_fields_mapping(bcf_hdr_t* curr_header, unsigned input_vcf_idx)
{
  assert(m_merged_vcf_header_ptr);
  for(auto field_idx=0;field_idx<curr_header->n[BCF_DT_ID];++field_idx)
  {
    auto id_pair = &(curr_header->id[BCF_DT_ID][field_idx]);
    for(auto line_type : { BCF_HL_FLT, BCF_HL_INFO, BCF_HL_FMT })
    {
      //The id is not used (or was deleted) for this line type - nothing to map
      if(!bcf_hdr_idinfo_exists(curr_header, line_type, field_idx))
        continue;
      bcf_hrec_t* header_record = bcf_hdr_id2hrec(curr_header, BCF_DT_ID, line_type, field_idx);
      //No header record means the field was deleted
      if(!header_record)
        continue;
      const char* field_name = id_pair->key;
      auto merged_idx = bcf_hdr_id2int(m_merged_vcf_header_ptr.get(), BCF_DT_ID, field_name);
      assert(merged_idx >= 0 && merged_idx < m_merged_vcf_header_ptr->n[BCF_DT_ID]);
      assert(bcf_hdr_idinfo_exists(m_merged_vcf_header_ptr, line_type, merged_idx));
      m_header_fields_LUT.add_input_merged_idx_pair(input_vcf_idx, field_idx, merged_idx);
    }
  }
}
/*
    Converts a comma-separated list of FILTER names into an array of header ids.

    Each item is looked up with bcf_hdr_id2int(); the single-character item "."
    is stored as -1 without a lookup. The number of items is written to
    *nfilters and the newly allocated array is returned; the caller frees it.
*/
static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters)
{
    kstring_t name = {0,0,0};
    const char *beg = filters, *end;
    int *ids = NULL, nids = 0;

    for (end=filters; ; end++)
    {
        // Keep scanning until a separator or the end of the string
        if ( *end && *end!=',' ) continue;

        ids = (int*) realloc(ids, (nids+1)*sizeof(int));
        if ( end-beg==1 && *beg=='.' )
            ids[nids] = -1;
        else
        {
            name.l = 0;
            kputsn(beg, end-beg, &name);
            ids[nids] = bcf_hdr_id2int(hdr, BCF_DT_ID, name.s);
        }
        nids++;

        if ( !*end ) break;
        beg = end + 1;
    }

    if ( name.m ) free(name.s);
    *nfilters = nids;
    return ids;
}
/*
 * Records which merged-header field id corresponds to the enum value field_enum.
 * The enum lookup table is grown first when field_enum does not fit.
 * If field_name is not found in the merged header (lookup returns -1), the
 * entry for field_enum is reset instead.
 */
void VariantHeaderMerger<fields_forward_LUT_ordering, fields_reverse_LUT_ordering, samples_forward_LUT_ordering, samples_reverse_LUT_ordering>::
  store_merged_field_idx_for_enum(const string& field_name, unsigned field_enum)
{
  //Grow the table so that field_enum is a valid index
  if(field_enum >= m_num_enums_allocated)
  {
    m_num_enums_allocated = 2u*field_enum + 1u;
    m_merged_field_idx_enum_lut.resize_luts_if_needed(1u, m_num_enums_allocated);
  }
  auto merged_field_idx = bcf_hdr_id2int(m_merged_vcf_header_ptr.get(), BCF_DT_ID, field_name.c_str());
  if(merged_field_idx != -1)
    m_merged_field_idx_enum_lut.add_input_merged_idx_pair(0u, field_enum, merged_field_idx);
  else
    m_merged_field_idx_enum_lut.reset_merged_idx_for_input(0u, field_enum);
}
/*
    Prepares the query state: applies the optional sample subset to all
    readers, sets up the output formatter (and the filter, if requested) and
    stores how much of each record needs to be unpacked.
*/
static void init_data(args_t *args)
{
    int *smpl_idx = NULL, nsmpl_idx = 0;
    int i;

    args->header = args->files->readers[0].header;

    // A sample list of "-" is treated the same as no list at all
    int subset_samples = args->sample_list && strcmp("-",args->sample_list);
    if ( subset_samples )
    {
        for (i=0; i<args->files->nreaders; i++)
        {
            int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
            if ( ret<0 ) error("Error parsing the sample list\n");
            else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
        }

        if ( args->sample_list[0]!='^' )
        {
            // the sample ordering may be different if not negated
            bcf_hdr_t *hdr0 = args->files->readers[0].header;
            int nlist;
            char **list = hts_readlist(args->sample_list, args->sample_is_file, &nlist);
            if ( !list ) error("Could not parse %s\n", args->sample_list);
            if ( nlist!=bcf_hdr_nsamples(hdr0) )
                error("The number of samples does not match, perhaps some are present multiple times?\n");

            nsmpl_idx = bcf_hdr_nsamples(hdr0);
            smpl_idx  = (int*) malloc(sizeof(int)*nsmpl_idx);
            for (i=0; i<nlist; i++)
            {
                smpl_idx[i] = bcf_hdr_id2int(hdr0, BCF_DT_SAMPLE,list[i]);
                free(list[i]);
            }
            free(list);
        }
    }

    args->convert = convert_init(args->header, smpl_idx, nsmpl_idx, args->format_str);
    if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
    free(smpl_idx);

    // Unpack only as much as the formatter and the filter need
    int max_unpack = convert_max_unpack(args->convert);
    if ( args->filter_str )
    {
        args->filter = filter_init(args->header, args->filter_str);
        max_unpack |= filter_max_unpack(args->filter);
    }
    args->files->max_unpack = max_unpack;
}
/*
    Merges the header lines and samples of _hr into hw.

    Header lines: a generic key=value line is added unless hw already had an
    identical key/value pair before the merge started; any other line is added
    unless bcf_hdr_get_hrec() finds a record of the same type for the line's
    first value.

    Samples: every sample of _hr is appended to hw. When a sample of that name
    is already present in hw, it is added as "<clash_prefix>:<name>" instead.
*/
void bcf_hdr_merge(bcf_hdr_t *hw, const bcf_hdr_t *_hr, const char *clash_prefix)
{
    bcf_hdr_t *hr = (bcf_hdr_t*)_hr;
    int isrc;

    // header lines; only the records hw had on entry are compared against
    const int ndst_ori = hw->nhrec;
    for (isrc=0; isrc<hr->nhrec; isrc++)
    {
        bcf_hrec_t *src = hr->hrec[isrc];
        int is_new;

        if ( src->type==BCF_HL_GEN && src->value )
        {
            int idst = 0;
            while ( idst<ndst_ori )
            {
                bcf_hrec_t *dst = hw->hrec[idst];
                if ( dst->type==BCF_HL_GEN && !strcmp(src->key,dst->key) && !strcmp(src->value,dst->value) ) break;
                idst++;
            }
            is_new = idst>=ndst_ori;
        }
        else
            is_new = bcf_hdr_get_hrec(hw, src->type, src->vals[0]) ? 0 : 1;

        if ( is_new ) bcf_hdr_add_hrec(hw, bcf_hrec_dup(src));
    }

    // samples
    for (isrc=0; isrc<bcf_hdr_nsamples(hr); isrc++)
    {
        char *smpl = hr->samples[isrc];
        if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, smpl)==-1 )
        {
            bcf_hdr_add_sample(hw,smpl);
            continue;
        }

        // there is a sample with the same name, disambiguate with the prefix
        int len = strlen(hr->samples[isrc]) + strlen(clash_prefix) + 1;
        char *renamed = (char*) malloc(sizeof(char)*(len+1));
        sprintf(renamed,"%s:%s",clash_prefix,hr->samples[isrc]);
        bcf_hdr_add_sample(hw,renamed);
        free(renamed);
    }
}
/*
 * Registers, for one input header, the mapping from each of its sample indexes
 * to the index of the same sample name in the merged sample list. Names seen
 * for the first time are appended to the merged list. Samples without a name,
 * or whose name cannot be looked up in the input header, have their mapping reset.
 */
void VariantHeaderMerger<fields_forward_LUT_ordering, fields_reverse_LUT_ordering, samples_forward_LUT_ordering, samples_reverse_LUT_ordering>::
  add_samples_mapping(bcf_hdr_t* curr_header, unsigned input_vcf_idx)
{
  for(auto sample_idx=0;sample_idx<bcf_hdr_nsamples(curr_header);++sample_idx)
  {
    auto sample_name = curr_header->samples[sample_idx];
    bool is_valid = sample_name
      && bcf_hdr_id2int(curr_header, BCF_DT_SAMPLE, curr_header->samples[sample_idx]) >= 0;
    if(!is_valid)
    {
      m_samples_LUT.reset_merged_idx_for_input(input_vcf_idx, sample_idx);
      continue;
    }
    //First occurrence of this name: it gets the next free merged index
    if(m_sample2idx_merged.find(curr_header->samples[sample_idx]) == m_sample2idx_merged.end())
    {
      auto next_merged_idx = m_sample2idx_merged.size();
      m_sample2idx_merged[curr_header->samples[sample_idx]] = next_merged_idx;
    }
    m_samples_LUT.add_input_merged_idx_pair(input_vcf_idx, sample_idx,
        m_sample2idx_merged[curr_header->samples[sample_idx]]);
  }
}
/*
    Plugin start-up hook: parses the command line, checks that the input
    header knows the GT tag and appends the INFO definitions of all tags
    selected for output to the output header.

    argc, argv  plugin arguments; -t/--tags selects the tags to set
    in          input header
    out         output header, receives the new INFO header lines

    Without -t all of AN, AC, NS, AC_Hom, AC_Het, AC_Hemi and AF are set.
    Returns 0 on success; usage and header problems are reported via error().
*/
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    memset(&args,0,sizeof(args_t));
    args.in_hdr  = in;
    args.out_hdr = out;

    static struct option loptions[] =
    {
        {"tags",1,0,'t'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:T:l:cd",loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 't': args.tags |= parse_tags(&args,optarg); break;
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    // Never use the usage text itself as a format string: a '%' in it
    // would be interpreted as a conversion specifier
    if ( optind != argc ) error("%s", usage());

    if ( !args.tags ) args.tags |= SET_AN|SET_AC|SET_NS|SET_AC_Hom|SET_AC_Het|SET_AC_Hemi|SET_AF;

    args.gt_id = bcf_hdr_id2int(args.in_hdr,BCF_DT_ID,"GT");
    if ( args.gt_id<0 ) error("Error: GT field is not present\n");

    if ( args.tags&SET_AN ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
    if ( args.tags&SET_AC ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
    if ( args.tags&SET_NS ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">");
    if ( args.tags&SET_AC_Hom ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hom,Number=A,Type=Integer,Description=\"Allele counts in homozygous genotypes\">");
    if ( args.tags&SET_AC_Het ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Het,Number=A,Type=Integer,Description=\"Allele counts in heterozygous genotypes\">");
    if ( args.tags&SET_AC_Hemi ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hemi,Number=A,Type=Integer,Description=\"Allele counts in hemizygous genotypes\">");
    if ( args.tags&SET_AF ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele frequency\">");

    return 0;
}
/*
    Collects the FILTER ids for the merged record `out`.

    The helper dictionary args->tmph is cleared and out->d.n_flt is reset to 0.
    Then, for every reader which has a line at the current position
    (ma->has_line[i]), the record is unpacked and each of its filters is
    looked up by name:
      - names already present in tmph are skipped, so each filter is added once;
      - a new name is translated to its id in the output header, appended to
        ma->flt (grown with hts_expand), counted in out->d.n_flt and remembered
        in tmph.
    A filter name that is not defined in the output header is reported through
    error().

    NOTE(review): the braces in the visible text do not balance - the end of
    this function (anything following the loop over readers) is not shown in
    this excerpt, so the description above covers the visible part only.
*/
void merge_filter(args_t *args, bcf1_t *out) { bcf_srs_t *files = args->files; bcf_hdr_t *out_hdr = args->out_hdr; int i, ret; khiter_t kitr; strdict_t *tmph = args->tmph; kh_clear(strdict, tmph); maux_t *ma = args->maux; out->d.n_flt = 0; for (i=0; i<files->nreaders; i++) { if ( !ma->has_line[i]) continue; bcf_sr_t *reader = &files->readers[i]; bcf1_t *line = reader->buffer[0]; bcf_hdr_t *hdr = reader->header; bcf_unpack(line, BCF_UN_ALL); int k; for (k=0; k<line->d.n_flt; k++) { const char *flt = hdr->id[BCF_DT_ID][line->d.flt[k]].key; kitr = kh_get(strdict, tmph, flt); if ( kitr == kh_end(tmph) ) { int id = bcf_hdr_id2int(out_hdr, BCF_DT_ID, flt); if ( id==-1 ) error("The filter not defined: %s\n", flt); hts_expand(int,out->d.n_flt+1,ma->mflt,ma->flt); ma->flt[out->d.n_flt] = id; out->d.n_flt++; kh_put(strdict, tmph, flt, &ret); } } }
/*
    Opens the indexed input VCF/BCF and prepares everything else the run
    needs: the sample to use, the optional mask regions, the optional chain
    file, the record buffer and the output stream (stdout by default).
    Any failure is reported through error().
*/
static void init_data(args_t *args)
{
    // Input file; an index is required
    args->files = bcf_sr_init();
    args->files->require_index = 1;
    if ( !bcf_sr_add_reader(args->files,args->fname) )
        error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum));
    args->hdr = args->files->readers[0].header;

    // Sample selection
    args->isample = -1;
    if ( args->sample )
    {
        args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
        if ( args->isample<0 ) error("No such sample: %s\n", args->sample);
    }
    if ( args->haplotype && args->isample<0 )
    {
        // Without an explicit sample the choice is unambiguous only for single-sample files
        if ( bcf_hdr_nsamples(args->hdr) > 1 ) error("The --sample option is expected with --haplotype\n");
        args->isample = 0;
    }

    // Mask regions
    if ( args->mask_fname )
    {
        args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
        if ( !args->mask ) error("Failed to initialize mask regions\n");
    }

    // In case we want to store the chains
    if ( args->chain_fname )
    {
        args->fp_chain = fopen(args->chain_fname,"w");
        if ( ! args->fp_chain ) error("Failed to create %s: %s\n", args->chain_fname, strerror(errno));
        args->chain_id = 0;
    }

    // Buffer of VCF records
    rbuf_init(&args->vcf_rbuf, 100);
    args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));

    // Output stream
    if ( !args->output_fname )
        args->fp_out = stdout;
    else
    {
        args->fp_out = fopen(args->output_fname,"w");
        if ( ! args->fp_out ) error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
    }
}
/*
    Compares the query sample (reader 0, args->sm_hdr) against every sample of
    the -g genotypes file (reader 1, args->gt_hdr).

    For each site present in both files and with diploid target genotypes,
    the likelihood of the target sample's genotype given the query sample's
    PLs is accumulated in args->lks[], and the number of sites used in
    args->sites[]. If the query file has no FORMAT/PL (or args->no_PLs is
    set), PLs are derived from GT via fake_PLs().

    Output goes to "<plot>.tab" when args->plot is set and to stdout otherwise:
    optionally one "SC" line per site (args->all_sites) and one "CN" line per
    target sample, sorted with cmp_doubleptr. With args->plot the file is
    closed and plot_check() is run.

    Frees args->pl_arr and args->tmp_arr. Fatal conditions go through error().
*/
static void check_gt(args_t *args)
{
    int i,ret, *gt2ipl = NULL, m_gt2ipl = 0, *gt_arr = NULL, ngt_arr = 0;
    int fake_pls = args->no_PLs;

    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->gt_hdr, BCF_DT_ID, "GT")<0 )
        error("[E::%s] GT not present in the header of %s?\n", __func__, args->files->readers[1].fname);
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);

    int tgt_isample = -1, query_isample = 0;
    if ( args->target_sample )
    {
        tgt_isample = bcf_hdr_id2int(args->gt_hdr, BCF_DT_SAMPLE, args->target_sample);
        if ( tgt_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[1].fname, args->target_sample);
    }
    if ( args->all_sites )
    {
        if ( tgt_isample==-1 )
        {
            fprintf(stderr,"No target sample selected for comparison, using the first sample in %s: %s\n", args->gt_fname,args->gt_hdr->samples[0]);
            tgt_isample = 0;
        }
    }
    if ( args->query_sample )
    {
        query_isample = bcf_hdr_id2int(args->sm_hdr, BCF_DT_SAMPLE, args->query_sample);
        if ( query_isample<0 ) error("No such sample in %s: [%s]\n", args->files->readers[0].fname, args->query_sample);
    }
    if ( args->all_sites )
        fprintf(fp, "# [1]SC, Site by Site Comparison\t[2]Chromosome\t[3]Position\t[4]-g alleles\t[5]-g GT (%s)\t[6]Coverage\t[7]Query alleles\t[8-]Query PLs (%s)\n",
                args->gt_hdr->samples[tgt_isample],args->sm_hdr->samples[query_isample]);

    // Main loop
    while ( (ret=bcf_sr_next_line(args->files)) )
    {
        if ( ret!=2 ) continue;     // the site must be present in both files
        bcf1_t *sm_line = args->files->readers[0].buffer[0];    // the query file
        bcf1_t *gt_line = args->files->readers[1].buffer[0];    // the -g target file
        bcf_unpack(sm_line, BCF_UN_FMT);
        bcf_unpack(gt_line, BCF_UN_FMT);

        // Init mapping from target genotype index to the sample's PL fields
        int n_gt2ipl = gt_line->n_allele*(gt_line->n_allele + 1)/2;
        if ( n_gt2ipl > m_gt2ipl )
        {
            m_gt2ipl = n_gt2ipl;
            gt2ipl   = (int*) realloc(gt2ipl, sizeof(int)*m_gt2ipl);
        }
        if ( !init_gt2ipl(args, gt_line, sm_line, gt2ipl, n_gt2ipl) ) continue;

        // Target genotypes
        int ngt, npl;
        if ( (ngt=bcf_get_genotypes(args->gt_hdr, gt_line, &gt_arr, &ngt_arr)) <= 0 )
            error("GT not present at %s:%d?", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
        ngt /= bcf_hdr_nsamples(args->gt_hdr);
        if ( ngt!=2 ) continue; // checking only diploid genotypes

        // Sample PLs
        if ( !fake_pls )
        {
            if ( (npl=bcf_get_format_int32(args->sm_hdr, sm_line, "PL", &args->pl_arr, &args->npl_arr)) <= 0 )
                error("PL not present at %s:%d?", args->sm_hdr->id[BCF_DT_CTG][sm_line->rid].key, sm_line->pos+1);
            npl /= bcf_hdr_nsamples(args->sm_hdr);
        }
        else
            npl = fake_PLs(args, args->sm_hdr, sm_line);

        // Calculate likelihoods for all samples, assuming diploid genotypes

        // For faster access to genotype likelihoods (PLs) of the query sample
        int max_ipl, *pl_ptr = args->pl_arr + query_isample*npl;
        double sum_pl = 0; // for converting PLs to probs
        for (max_ipl=0; max_ipl<npl; max_ipl++)
        {
            if ( pl_ptr[max_ipl]==bcf_int32_vector_end ) break;
            if ( pl_ptr[max_ipl]==bcf_int32_missing ) continue;
            sum_pl += pow(10, -0.1*pl_ptr[max_ipl]);
        }
        if ( sum_pl==0 ) continue; // no PLs present

        // The main stats: concordance of the query sample with the target -g samples
        for (i=0; i<bcf_hdr_nsamples(args->gt_hdr); i++)
        {
            int *gt_ptr = gt_arr + i*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( a<0 || b<0 ) continue; // missing genotypes
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            int igt_tgt = bcf_alleles2gt(a,b);  // genotype index in the target file
            int igt_qry = gt2ipl[igt_tgt];      // corresponding genotype in query file
            if ( igt_qry>=max_ipl || pl_ptr[igt_qry]<0 ) continue;   // genotype not present in query sample: haploid or missing
            args->lks[i] += log(pow(10, -0.1*pl_ptr[igt_qry])/sum_pl);
            args->sites[i]++;
        }

        if ( args->all_sites )
        {
            // Print LKs at all sites for debugging
            int *gt_ptr = gt_arr + tgt_isample*ngt;
            if ( gt_ptr[1]==bcf_int32_vector_end ) continue;    // skip haploid genotypes
            int a = bcf_gt_allele(gt_ptr[0]);
            int b = bcf_gt_allele(gt_ptr[1]);
            if ( args->hom_only && a!=b ) continue; // heterozygous genotype
            fprintf(fp, "SC\t%s\t%d", args->gt_hdr->id[BCF_DT_CTG][gt_line->rid].key, gt_line->pos+1);
            for (i=0; i<gt_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', gt_line->d.allele[i]);
            fprintf(fp, "\t%s/%s", a>=0 ? gt_line->d.allele[a] : ".", b>=0 ? gt_line->d.allele[b] : ".");

            // pl_ptr still points at the PLs of the query sample
            int igt;
            for (i=0; i<sm_line->n_allele; i++) fprintf(fp, "%c%s", i==0?'\t':',', sm_line->d.allele[i]);
            for (igt=0; igt<npl; igt++)
            {
                if ( pl_ptr[igt]==bcf_int32_vector_end ) break;
                else if ( pl_ptr[igt]==bcf_int32_missing ) fprintf(fp, ".");
                else fprintf(fp, "\t%d", pl_ptr[igt]);
            }
            fprintf(fp, "\n");
        }
    }
    free(gt2ipl);
    free(gt_arr);
    free(args->pl_arr);
    free(args->tmp_arr);

    // Scale LKs and certainties
    int nsamples = bcf_hdr_nsamples(args->gt_hdr);
    double min = args->lks[0];
    for (i=1; i<nsamples; i++)
        if ( min>args->lks[i] ) min = args->lks[i];
    for (i=0; i<nsamples; i++)
        args->lks[i] = min ? args->lks[i] / min : 0;
    double max_avg = args->sites[0] ? args->lks[0]/args->sites[0] : 0;
    for (i=1; i<nsamples; i++)
    {
        double val = args->sites[i] ? args->lks[i]/args->sites[i] : 0;
        if ( max_avg<val ) max_avg = val;
    }

    // Sorted output: sort pointers into lks[] so that the original index can be recovered
    double **p = (double**) malloc(sizeof(double*)*nsamples);
    for (i=0; i<nsamples; i++) p[i] = &args->lks[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    fprintf(fp, "# [1]CN\t[2]Concordance with %s (total)\t[3]Concordance (average)\t[4]Number of sites compared\t[5]Sample\t[6]Sample ID\n", args->sm_hdr->samples[query_isample]);
    for (i=0; i<nsamples; i++)
    {
        int idx = p[i] - args->lks;
        double avg = args->sites[idx] ? args->lks[idx]/args->sites[idx] : 0;
        fprintf(fp, "CN\t%e\t%e\t%.0f\t%s\t%d\n", 1-args->lks[idx], 1-avg/max_avg, args->sites[idx], args->gt_hdr->samples[idx], i);
    }
    free(p);

    if ( args->plot )
    {
        fclose(fp);
        plot_check(args, args->target_sample ? args->target_sample : "", args->sm_hdr->samples[query_isample]);
    }
}
/*
    Pairwise comparison of all samples in the query file (args->sm_hdr).

    For every site and every pair of samples (i,j), j<i, with usable data, the
    minimum of PL_i[k]+PL_j[k] over the genotypes k is added to args->lks[],
    the pair's site counter args->cnts[] is increased and the smaller of the
    two depths is added to args->dps[]. Pairs are stored in a flat
    lower-triangular layout: the pair (i,j) lives at index i*(i-1)/2 + j, which
    is why `idx` must advance by exactly one for each j, and by i when a whole
    row is skipped.

    If FORMAT/PL is not defined (or args->no_PLs is set) PLs are derived from
    GT with fake_PLs(); if FORMAT/DP is not defined, depth is ignored and every
    used site counts as depth 1.

    Output (to "<plot>.tab" with args->plot, to stdout otherwise): optional
    per-site "SD" lines, per-sample "SM" lines sorted with cmp_doubleptr, one
    "MD" line and the pairwise "CN" lines. Allocates args->sites; frees
    args->pl_arr and args->tmp_arr.
*/
static void cross_check_gts(args_t *args)
{
    int nsamples = bcf_hdr_nsamples(args->sm_hdr), ndp_arr = 0;
    // this will overflow one day...
    unsigned int *dp  = (unsigned int*) calloc(nsamples,sizeof(unsigned int));
    unsigned int *ndp = (unsigned int*) calloc(nsamples,sizeof(unsigned int));
    int fake_pls = args->no_PLs, ignore_dp = 0;
    int i,j,k,idx, pl_warned = 0, dp_warned = 0;
    int32_t *dp_arr = NULL;
    int *is_hom = args->hom_only ? (int*) malloc(sizeof(int)*nsamples) : NULL;

    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
        fake_pls = 1;
    }
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "DP")<0 ) ignore_dp = 1;

    FILE *fp = args->plot ? open_file(NULL, "w", "%s.tab", args->plot) : stdout;
    print_header(args, fp);
    if ( args->all_sites )
        fprintf(fp,"# [1]SD, Average Site Discordance\t[2]Chromosome\t[3]Position\t[4]Number of available pairs\t[5]Average discordance\n");

    while ( bcf_sr_next_line(args->files) )
    {
        bcf1_t *line = args->files->readers[0].buffer[0];
        bcf_unpack(line, BCF_UN_FMT);

        int npl;
        if ( !fake_pls )
        {
            npl = bcf_get_format_int32(args->sm_hdr, line, "PL", &args->pl_arr, &args->npl_arr);
            if ( npl<=0 ) { pl_warned++; continue; }
            npl /= nsamples;
        }
        else
            npl = fake_PLs(args, args->sm_hdr, line);
        if ( !ignore_dp && bcf_get_format_int32(args->sm_hdr, line, "DP", &dp_arr, &ndp_arr) <= 0 ) { dp_warned++; continue; }

        if ( args->hom_only )
        {
            for (i=0; i<nsamples; i++)
                is_hom[i] = is_hom_most_likely(line->n_allele, args->pl_arr+i*npl);
        }

        double sum = 0; int nsum = 0;
        idx = 0;
        for (i=0; i<nsamples; i++)
        {
            // Skipping sample i skips its whole row of i pairs
            int *ipl = &args->pl_arr[i*npl];
            if ( *ipl==-1 ) { idx += i; continue; } // missing genotype
            if ( !ignore_dp && (dp_arr[i]==bcf_int32_missing || !dp_arr[i]) ) { idx += i; continue; }
            if ( args->hom_only && !is_hom[i] ) { idx += i; continue; }

            for (j=0; j<i; j++)
            {
                int *jpl = &args->pl_arr[j*npl];
                if ( *jpl==-1 ) { idx++; continue; } // missing genotype
                if ( !ignore_dp && (dp_arr[j]==bcf_int32_missing || !dp_arr[j]) ) { idx++; continue; }
                if ( args->hom_only && !is_hom[j] ) { idx++; continue; }

                int min_pl = INT_MAX;
                for (k=0; k<npl; k++)
                {
                    // A missing value invalidates the pair (k stays below npl)
                    if ( ipl[k]==bcf_int32_missing || jpl[k]==bcf_int32_missing ) break;
                    // End of a shorter vector: accept what has been seen so far
                    if ( ipl[k]==bcf_int32_vector_end || jpl[k]==bcf_int32_vector_end ) { k = npl; break; }
                    if ( min_pl > ipl[k]+jpl[k] ) min_pl = ipl[k]+jpl[k];
                }
                if ( k!=npl ) { idx++; continue; }

                if ( args->all_sites ) { sum += min_pl; nsum++; }
                args->lks[idx] += min_pl;
                args->cnts[idx]++;

                if ( !ignore_dp )
                {
                    args->dps[idx] += dp_arr[i] < dp_arr[j] ? dp_arr[i] : dp_arr[j];
                    dp[i] += dp_arr[i]; ndp[i]++;
                    dp[j] += dp_arr[j]; ndp[j]++;
                }
                else
                {
                    args->dps[idx]++;
                    dp[i]++; ndp[i]++;
                    dp[j]++; ndp[j]++;
                }
                idx++;
            }
        }
        if ( args->all_sites )
            fprintf(fp,"SD\t%s\t%d\t%d\t%.0f\n", args->sm_hdr->id[BCF_DT_CTG][line->rid].key, line->pos+1, nsum, nsum?sum/nsum:0);
    }
    free(dp_arr);
    free(args->pl_arr);
    free(args->tmp_arr);
    free(is_hom);

    if ( pl_warned ) fprintf(stderr, "[W::%s] PL was not found at %d site(s)\n", __func__, pl_warned);
    if ( dp_warned ) fprintf(stderr, "[W::%s] DP was not found at %d site(s)\n", __func__, dp_warned);

    // Output samples sorted by average discordance
    double *score = (double*) calloc(nsamples,sizeof(double));
    args->sites   = (double*) calloc(nsamples,sizeof(double));
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            score[i] += args->lks[idx];
            score[j] += args->lks[idx];
            args->sites[i] += args->cnts[idx];
            args->sites[j] += args->cnts[idx];
            idx++;
        }
    }
    for (i=0; i<nsamples; i++)
        if ( args->sites[i] ) score[i] /= args->sites[i];

    // Sort pointers into score[] so that the original sample index can be recovered
    double **p = (double**) malloc(sizeof(double*)*nsamples), avg_score = 0;
    for (i=0; i<nsamples; i++) p[i] = &score[i];
    qsort(p, nsamples, sizeof(*p), cmp_doubleptr);

    // The average discordance gives the number of differing sites in % with -G1
    fprintf(fp, "# [1]SM\t[2]Average Discordance\t[3]Average depth\t[4]Average number of sites\t[5]Sample\t[6]Sample ID\n");
    for (i=0; i<nsamples; i++)
    {
        idx = p[i] - score;
        double adp = ndp[idx] ? (double)dp[idx]/ndp[idx] : 0;
        // A single sample has no pairs; avoid dividing by zero
        double nsites = nsamples>1 ? args->sites[idx]/(nsamples-1) : 0;
        avg_score += score[idx];
        fprintf(fp, "SM\t%f\t%.2lf\t%.0lf\t%s\t%d\n", score[idx]*100., adp, nsites, args->sm_hdr->samples[idx],i);
    }

    // Overall score: maximum absolute deviation from the average score
    fprintf(fp, "# [1] MD\t[2]Maximum deviation\t[3]The culprit\n");
    fprintf(fp, "MD\t%f\t%s\n", (score[idx] - avg_score/nsamples)*100., args->sm_hdr->samples[idx]);    // idx still set
    free(p);
    free(score);
    free(dp);
    free(ndp);

    // Pairwise discordances
    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
    idx = 0;
    for (i=0; i<nsamples; i++)
    {
        for (j=0; j<i; j++)
        {
            fprintf(fp, "CN\t%.0f\t%d\t%.2f\t%s\t%s\n", args->lks[idx], args->cnts[idx], args->cnts[idx]?(double)args->dps[idx]/args->cnts[idx]:0.0,
                    args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
            idx++;
        }
    }

    // Close only a file we opened ourselves; stdout stays usable for the caller
    if ( fp!=stdout ) fclose(fp);
    else fflush(fp);
    if ( args->plot )
        plot_cross_check(args);
}
/*
    Calculates allele counts for one record.

    header  header the record belongs to
    line    the record; it is unpacked as needed
    ac      output array with room for line->n_allele counts; ac[0] is the
            reference allele. It is always zeroed first.
    which   bit mask: with BCF_UN_INFO the counts are taken from INFO/AN and
            INFO/AC when both are present; with BCF_UN_FMT they are counted
            from FORMAT/GT. INFO is tried first.

    Returns 1 when the counts were filled in and 0 when they could not be
    determined. An unsupported value type or a genotype referring to a
    non-existent allele terminates the program with exit(1).
*/
int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which)
{
    int i;
    for (i=0; i<line->n_allele; i++) ac[i]=0;

    // Use INFO/AC,AN field only when asked
    if ( which&BCF_UN_INFO )
    {
        bcf_unpack(line, BCF_UN_INFO);
        int an_id = bcf_hdr_id2int(header, BCF_DT_ID, "AN");
        int ac_id = bcf_hdr_id2int(header, BCF_DT_ID, "AC");
        int an=-1, ac_len=0, ac_type=0;
        uint8_t *ac_ptr=NULL;
        if ( an_id>=0 && ac_id>=0 )
        {
            for (i=0; i<line->n_info; i++)
            {
                bcf_info_t *z = &line->d.info[i];
                if ( z->key == an_id ) an = z->v1.i;
                else if ( z->key == ac_id ) { ac_ptr = z->vptr; ac_len = z->len; ac_type = z->type; }
            }
        }
        if ( an>=0 && ac_ptr )
        {
            // AC holds one value per ALT allele and is copied to ac[1..];
            // more values than ALT alleles would overflow the caller's array
            if ( ac_len >= line->n_allele )
            {
                fprintf(stderr, "[W::%s] too many INFO/AC values (%d) for %d alleles at %s:%d\n", __func__,
                        ac_len, (int)line->n_allele, header->id[BCF_DT_CTG][line->rid].key, line->pos+1);
                return 0;
            }

            int nac = 0;
            #define BRANCH_INT(type_t) { \
                type_t *p = (type_t *) ac_ptr; \
                for (i=0; i<ac_len; i++) \
                { \
                    ac[i+1] = p[i]; \
                    nac += p[i]; \
                } \
            }
            switch (ac_type) {
                case BCF_BT_INT8:  BRANCH_INT(int8_t); break;
                case BCF_BT_INT16: BRANCH_INT(int16_t); break;
                case BCF_BT_INT32: BRANCH_INT(int32_t); break;
                default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
            }
            #undef BRANCH_INT
            assert( an>=nac );  // sanity check for missing values
            ac[0] = an - nac;
            return 1;
        }
    }

    // Split genotype fields only when asked
    if ( which&BCF_UN_FMT )
    {
        int gt_id = bcf_hdr_id2int(header,BCF_DT_ID,"GT");
        if ( gt_id<0 ) return 0;
        bcf_unpack(line, BCF_UN_FMT);
        bcf_fmt_t *fmt_gt = NULL;
        for (i=0; i<(int)line->n_fmt; i++)
            if ( line->d.fmt[i].id==gt_id ) { fmt_gt = &line->d.fmt[i]; break; }
        if ( !fmt_gt ) return 0;

        // A GT value encodes (allele index + 1) << 1 | phased; the allele
        // index is validated before it is used to index ac[]
        #define BRANCH_INT(type_t,missing,vector_end) { \
            for (i=0; i<line->n_sample; i++) \
            { \
                type_t *p = (type_t*) (fmt_gt->p + i*fmt_gt->size); \
                int ial; \
                for (ial=0; ial<fmt_gt->n; ial++) \
                { \
                    if ( p[ial]==vector_end ) break; /* smaller ploidy */ \
                    if ( !(p[ial]>>1) || p[ial]==missing ) continue; /* missing allele */ \
                    int iallele = (p[ial]>>1) - 1; \
                    if ( iallele<0 || iallele>=line->n_allele ) \
                    { \
                        fprintf(stderr, "[E::%s] incorrect allele index %d at %s:%d\n", __func__, iallele, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \
                        exit(1); \
                    } \
                    ac[iallele]++; \
                } \
            } \
        }
        switch (fmt_gt->type) {
            case BCF_BT_INT8:  BRANCH_INT(int8_t,  bcf_int8_missing, bcf_int8_vector_end); break;
            case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_missing, bcf_int16_vector_end); break;
            case BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_missing, bcf_int32_vector_end); break;
            default: fprintf(stderr, "[E::%s] todo: %d at %s:%d\n", __func__, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break;
        }
        #undef BRANCH_INT
        return 1;
    }
    return 0;
}
int main(int argc, char **argv) { char *fname = argc>1 ? argv[1] : "/dev/null"; htsFile *fp = hts_open(fname, "w"); bcf_hdr_t *hdr1, *hdr2; hdr1 = bcf_hdr_init("w"); hdr2 = bcf_hdr_init("w"); // Add two shared and two private annotations bcf_hdr_append(hdr1, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT1,Description=\"Filter 1\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT4,Description=\"Filter 4\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF1,Number=.,Type=Integer,Description=\"Info 1\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF4,Number=.,Type=Integer,Description=\"Info 4\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT1,Number=.,Type=Integer,Description=\"FMT 1\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT4,Number=.,Type=Integer,Description=\"FMT 4\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_add_sample(hdr1,"SMPL1"); bcf_hdr_add_sample(hdr1,"SMPL2"); bcf_hdr_add_sample(hdr2,"SMPL1"); 
bcf_hdr_add_sample(hdr2,"SMPL2"); bcf_hdr_sync(hdr1); bcf_hdr_sync(hdr2); hdr2 = bcf_hdr_merge(hdr2,hdr1); bcf_hdr_sync(hdr2); if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname); bcf1_t *rec = bcf_init1(); rec->rid = bcf_hdr_name2id(hdr1, "1"); rec->pos = 0; bcf_update_alleles_str(hdr1, rec, "G,A"); int32_t tmpi[3]; tmpi[0] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT1"); tmpi[1] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"); tmpi[2] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT3"); bcf_update_filter(hdr1, rec, tmpi, 3); tmpi[0] = 1; bcf_update_info_int32(hdr1, rec, "INF1", tmpi, 1); tmpi[0] = 2; bcf_update_info_int32(hdr1, rec, "INF2", tmpi, 1); tmpi[0] = 3; bcf_update_info_int32(hdr1, rec, "INF3", tmpi, 1); tmpi[0] = tmpi[1] = 1; bcf_update_format_int32(hdr1, rec, "FMT1", tmpi, 2); tmpi[0] = tmpi[1] = 2; bcf_update_format_int32(hdr1, rec, "FMT2", tmpi, 2); tmpi[0] = tmpi[1] = 3; bcf_update_format_int32(hdr1, rec, "FMT3", tmpi, 2); bcf_remove_filter(hdr1, rec, bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"), 0); bcf_update_info_int32(hdr1, rec, "INF2", NULL, 0); bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0); bcf_translate(hdr2, hdr1, rec); if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname); // Clean bcf_destroy1(rec); bcf_hdr_destroy(hdr1); bcf_hdr_destroy(hdr2); int ret; if ( (ret=hts_close(fp)) ) { fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); exit(ret); } return 0; }
/*
 * Read the input VCF/BCF, build one BAF histogram per chromosome, run
 * init_dist() on each and create the output files: <output_dir>/dist.dat
 * (header lines only are written here) and the helper plotting script
 * <output_dir>/dist.py, which is made executable.
 *
 * args->sample may be NULL only when the file holds exactly one sample.
 * All failures are reported through error().
 */
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) )
        error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));

    bcf_hdr_t *hdr = files->readers[0].header;
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        // samples[0] does not exist in a sites-only file
        if ( bcf_hdr_nsamples(hdr)<1 ) error("No samples in the VCF: %s\n", args->fname);
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 )
        error("No such sample: %s\n", args->sample);

    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    // Bin positions, evenly spaced over [0,1]; shared by all distributions
    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        // The value is used as an array index below: values outside [0,1]
        // (and NaN, for which the comparison is false) must be skipped.
        if ( !(baf[0]>=0 && baf[0]<=1) ) continue;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    // Header of the data file
    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
    //-------- matplotlib script --------------
    // Python block structure depends on the leading whitespace inside these literals.
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        "    csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "    with open(outdir+'/dist.dat', 'rb') as f:\n"
        "        reader = csv.reader(f, 'tab')\n"
        "        for row in reader:\n"
        "            if row[0][0]=='#': continue\n"
        "            type = row[0]\n"
        "            chr = row[1]\n"
        "            if type=='DIST':\n"
        "                if chr not in dat: dat[chr] = []\n"
        "                dat[chr].append(row)\n"
        "            elif type=='FIT':\n"
        "                if chr not in fit: fit[chr] = []\n"
        "                fit[chr].append(row)\n"
        "            elif type=='CN':\n"
        "                cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "    if chr in fit:\n"
        "        for i in range(len(fit[chr])):\n"
        "            pfit = fit[chr][i]\n"
        "            exec('def xfit(x): return '+pfit[5])\n"
        "            istart = int(pfit[3])\n"
        "            iend = int(pfit[4])+1\n"
        "            vals = dat[chr][istart:iend]\n"
        "            args = {}\n"
        "            if i==0: args = {'label':'Target to Fit'}\n"
        "            ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "            if i==0: args = {'label':'Best Fit'}\n"
        "            ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "    ax.set_title('BAF distribution, chr'+chr)\n"
        "    ax.set_xlabel('BAF')\n"
        "    ax.set_ylabel('Frequency')\n"
        "    ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "    plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "    plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    xlabels = sorted(cn.keys())\n"
        "    xvals = range(len(xlabels))\n"
        "    yvals = [float(cn[x]) for x in xlabels]\n"
        "    ax.plot(xvals,yvals,'o',color='red')\n"
        "    for i in range(len(xvals)):\n"
        "        if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "    ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "    ax.set_xticks(xvals)\n"
        "    ax.set_xticklabels(xlabels,rotation=45)\n"
        "    ax.set_xlim(-1,len(xlabels))\n"
        "    ax.set_ylim(0,5.0)\n"
        "    ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "    ax.set_xlabel('Chromosome')\n"
        "    ax.set_ylabel('Copy Number')\n"
        "    plt.savefig(outdir+'/copy-number.png')\n"
        "    plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        "    def error(self, message):\n"
        "        self.print_help()\n"
        "        sys.stderr.write('error: %%s\\n' %% message)\n"
        "        sys.exit(2)\n"
        "\n"
        "def main():\n"
        "    parser = myParser()\n"
        "    parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "    parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "    parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "    args = parser.parse_args()\n"
        "    dat = {}; fit = {}; cn = {}\n"
        "    read_dat(dat,fit,cn)\n"
        "    if args.distrib!=None:\n"
        "        plot_dist(dat,fit,args.distrib)\n"
        "    if args.all:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "        plot_copy_number(cn)\n"
        "    elif args.copy_number:\n"
        "        plot_copy_number(cn)\n"
        "    else:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        "    main()\n",
        args->output_dir);
    //---------------------------------------

    // rwxr-xr-x: the script is meant to be run directly
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
void BlockQuantify::count() { _impl->fasta_to_use.reset(new FastaFile(_impl->ref_fasta)); #ifdef DEBUG_BLOCKQUANTIFY int lastpos = 0; std::cerr << "starting block." << "\n"; #endif auto current_bs_start = _impl->variants.begin(); std::string current_chr; int current_bs = -1; bool current_bs_valid = false; // function to compute the QQ values for truth variants in the current // benchmarking superlocus const auto update_bs_filters = [this, ¤t_bs_start](BlockQuantifyImpl::variantlist_t::iterator to) { std::set<int> bs_filters; for(auto cur = current_bs_start; cur != to; ++cur) { for(int nf = 0; nf < (*cur)->d.n_flt; ++nf) { const int f = (*cur)->d.flt[nf]; if(f != bcf_hdr_id2int(_impl->hdr, BCF_DT_ID, "PASS")) { bs_filters.insert(f); } } } if(bs_filters.empty()) { return; } for(auto cur = current_bs_start; cur != to; ++cur) { const std::string bdt = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 0); const std::string bvq = bcfhelpers::getFormatString(_impl->hdr, *cur, "BVT", 1); // filter TPs where the query call in NOCALL if(bdt == "TP" && bvq == "NOCALL") { for(auto f : bs_filters) { bcf_add_filter(_impl->hdr, *cur, f); } } } }; // function to compute the QQ values for truth variants in the current // benchmarking superlocus const auto update_bs_qq = [this, ¤t_bs_start](BlockQuantifyImpl::variantlist_t::iterator to) { std::vector<float> tp_qqs; for(auto cur = current_bs_start; cur != to; ++cur) { const float qqq = bcfhelpers::getFormatFloat(_impl->hdr, *cur, "QQ", 1); if(std::isnan(qqq)) { continue; } const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 1); // we want the scores of all TPs in this BS if(bd == "TP") { tp_qqs.push_back(qqq); } } float t_qq = bcfhelpers::missing_float(); if(!tp_qqs.empty()) { t_qq = *(std::min_element(tp_qqs.begin(), tp_qqs.end())); } /** compute the median over all variants */ int fsize = bcf_hdr_nsamples(_impl->hdr); float * fmt = (float*)calloc((size_t) fsize, sizeof(float)); for(auto cur = 
current_bs_start; cur != to; ++cur) { const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 0); bcf_get_format_float(_impl->hdr, *cur, "QQ", &fmt, &fsize); if(bd != "TP") { fmt[0] = bcfhelpers::missing_float(); } else { const float qqq = bcfhelpers::getFormatFloat(_impl->hdr, *cur, "QQ", 1); const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 1); if(bd == "TP" && !std::isnan(qqq)) { fmt[0] = qqq; } else { fmt[0] = t_qq; } } bcf_update_format_float(_impl->hdr, *cur, "QQ", fmt, fsize); } free(fmt); #ifdef DEBUG_BLOCKQUANTIFY const int bs = bcfhelpers::getInfoInt(_impl->hdr, *current_bs_start, "BS", -1); std::string values; for(float x : tp_qqs) { values += std::to_string(x) + ","; } std::cerr << "BS: " << bs << " T_QQ = " << t_qq << " [" << values << "]" << "\n"; #endif }; const auto update_bs_conf_boundary_flag = [this, ¤t_bs_start](BlockQuantifyImpl::variantlist_t::iterator to) { static const int has_conf = 1; static const int has_non_conf = 2; int conf_non_conf = 0; for(auto cur = current_bs_start; cur != to; ++cur) { const std::string regions = bcfhelpers::getInfoString(_impl->hdr, *cur, "Regions", ""); if(regions.find("CONF") == std::string::npos) { conf_non_conf |= has_non_conf; } else { conf_non_conf |= has_conf; } if(regions.find("TS_boundary") != std::string::npos) { conf_non_conf |= has_non_conf | has_conf; } } for(auto cur = current_bs_start; cur != to; ++cur) { const std::string regions = bcfhelpers::getInfoString(_impl->hdr, *cur, "Regions", ""); if(conf_non_conf == (has_conf | has_non_conf)) { if(regions.find("TS_boundary") == std::string::npos) { bcf_update_info_string(_impl->hdr, *cur, "Regions", (regions.empty() ? "TS_boundary" : regions + ",TS_boundary").c_str()); } } else if(conf_non_conf == has_conf) { if(regions.find("TS_contained") == std::string::npos) { // also flag fully confident superloci bcf_update_info_string(_impl->hdr, *cur, "Regions", (regions.empty() ? 
"TS_contained" : regions + ",TS_contained").c_str()); } } } }; for(auto v_it = _impl->variants.begin(); v_it != _impl->variants.end(); ++v_it) { // update fields, must output GA4GH-compliant fields countVariants(*v_it); // determine benchmarking superlocus const std::string vchr = bcfhelpers::getChrom(_impl->hdr, *v_it); const int vbs = bcfhelpers::getInfoInt(_impl->hdr, *v_it, "BS"); if(!current_bs_valid) { current_bs = vbs; current_chr = vchr; current_bs_valid = true; } #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "current BS = " << current_bs << " vbs = " << vbs << "\n"; #endif if( current_bs_start != v_it && (vbs != current_bs || vbs < 0 || vchr != current_chr)) { #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "finishing BS = " << current_bs << " vbs = " << vbs << "\n"; #endif update_bs_qq(v_it); update_bs_filters(v_it); update_bs_conf_boundary_flag(v_it); current_bs = vbs; current_chr = vchr; current_bs_start = v_it; } } // do final superlocus (if any) update_bs_qq(_impl->variants.end()); update_bs_filters(_impl->variants.end()); update_bs_conf_boundary_flag(_impl->variants.end()); for(auto & v : _impl->variants) { #ifdef DEBUG_BLOCKQUANTIFY lastpos = v->pos; #endif // use BD and BVT to make ROCs rocEvaluate(v); } #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "finished block " << lastpos << " - " << _impl->variants.size() << " records on thread " << std::this_thread::get_id() << "\n"; #endif _impl->fasta_to_use.reset(nullptr); }
/*
 * Prepare the run: pick the query sample, subset the header to the samples
 * that are needed, precompute the PL -> probability table, set up the
 * two-state (HW/AZ) HMM and print the output header to stdout.
 *
 * args->estimate_AF selects the samples used for allele frequency
 * estimation: NULL means only the query sample is kept, "-" keeps all
 * samples, anything else is a file with sample names.
 * All failures are reported through error().
 */
static void init_data(args_t *args)
{
    args->prev_rid = args->skip_rid = -1;
    args->hdr = args->files->readers[0].header;

    // Must come first: samples[0] below does not exist in a sites-only file
    if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");

    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = strdup(args->hdr->samples[0]);
    }

    // Set samples
    kstring_t str = {0,0,0};
    if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
    {
        int i, n;
        char **smpls = hts_readlist(args->estimate_AF, 1, &n);
        if ( !smpls ) error("Could not read the file: %s\n", args->estimate_AF);

        // Make sure the query sample is included
        for (i=0; i<n; i++)
            if ( !strcmp(args->sample,smpls[i]) ) break;

        // Add the query sample if not present (the search above ran to the end)
        if ( i==n ) kputs(args->sample, &str);

        for (i=0; i<n; i++)
        {
            if ( str.l ) kputc(',', &str);
            kputs(smpls[i], &str);
            free(smpls[i]);
        }
        free(smpls);
    }
    else if ( !args->estimate_AF )
        kputs(args->sample, &str);

    if ( str.l )
    {
        int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
        if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
        else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
    }

    if ( args->af_tag )
        if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
            error("No such INFO tag in the VCF: %s\n", args->af_tag);

    args->nsmpl = bcf_hdr_nsamples(args->hdr);
    args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
    // ismpl is stored as the query sample's index; a negative value means the name is unknown
    if ( args->ismpl<0 ) error("No such sample: %s\n", args->sample);
    free(str.s);

    // Phred-scaled likelihood -> probability lookup
    int i;
    for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);

    // Init transition matrix and HMM
    double tprob[4];
    MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
    MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
    MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
    MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;

    if ( args->genmap_fname )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
    }
    else if ( args->rec_rate > 0 )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
    }
    else
        args->hmm = hmm_init(2, tprob, 10000);

    // print header
    printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
    printf("# The command line was:\tbcftools %s", args->argv[0]);
    for (i=1; i<args->argc; i++)
        printf(" %s",args->argv[i]);
    printf("\n#\n");
    printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
}
/*
 * Initialise one value token of a filter expression from str[0..len-1].
 *
 * The text is tried, in this order, as:
 *   1. a quoted string constant (single or double quotes) -> tok->key
 *   2. one of the fixed columns %QUAL, %TYPE, %FILTER
 *   3. a tag known to the header dictionary (an optional "INFO/" prefix is
 *      stripped first)
 *   4. a subscripted vector tag, TAG[idx]
 *   5. a numeric constant -> tok->threshold
 * Anything else is reported through error(). Always returns 0.
 */
static int filters_init1(filter_t *filter, char *str, int len, token_t *tok)
{
    tok->tok_type = TOK_VAL;
    tok->hdr_id   = -1;
    tok->pass     = -1;

    // is this a string constant?
    if ( str[0]=='"' || str[0]=='\'' )
    {
        int quote = str[0];
        // len<2: a lone quote would match itself as the closing quote and
        // lead to a copy of length -1 below
        if ( len<2 || str[len-1] != quote ) error("TODO: [%s]\n", filter->str);
        tok->key = (char*) calloc(len-1,sizeof(char));
        tok->num_value = len-2;
        memcpy(tok->key,str+1,len-2);
        tok->key[len-2] = 0;
        return 0;
    }

    if ( !strncmp(str,"INFO/",5) ) { str += 5; len -= 5; }

    // Require the full keyword: comparing only len characters would accept
    // any prefix such as "%Q"
    if ( len==5 && !strncmp(str,"%QUAL",5) )
    {
        tok->setter = filters_set_qual;
        tok->tag = strdup("%QUAL");
        return 0;
    }
    if ( len==5 && !strncmp(str,"%TYPE",5) )
    {
        tok->setter = filters_set_type;
        tok->tag = strdup("%TYPE");
        return 0;
    }
    if ( len==7 && !strncmp(str,"%FILTER",7) )
    {
        tok->setter = filters_set_filter;
        tok->comparator = filters_cmp_filter;
        tok->tag = strdup("%FILTER");
        return 0;
    }

    // is this one of the VCF tags? For now do only INFO and QUAL, to be extended...
    kstring_t tmp = {0,0,0};
    kputsn(str, len, &tmp);

    tok->hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, tmp.s);
    if ( tok->hdr_id>=0 )
    {
        if ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) == BCF_HT_FLAG )
            tok->setter = filters_set_info_flag;
        else
            tok->setter = filters_set_info;
        tok->tag = strdup(tmp.s);
        free(tmp.s);
        return 0;
    }

    // is it a subscripted VCF vector tag?
    if ( tmp.l && tmp.s[tmp.l-1] == ']' )
    {
        size_t i;
        for (i=0; i<tmp.l; i++)
            if ( tmp.s[i]=='[' ) break;
        if ( i<tmp.l )
        {
            tmp.s[i] = 0;   // temporarily cut the string at the bracket
            tok->hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, tmp.s);
            if ( tok->hdr_id>=0 )
            {
                switch ( bcf_hdr_id2type(filter->hdr,BCF_HL_INFO,tok->hdr_id) )
                {
                    case BCF_HT_INT:  tok->setter = &filters_set_info_int; break;
                    case BCF_HT_REAL: tok->setter = &filters_set_info_float; break;
                    default: error("FIXME: not ready for this, sorry\n");
                }
                tok->idx = atoi(&tmp.s[i+1]);
                free(tmp.s);
                return 0;
            }
            tmp.s[i] = '[';  // unknown tag: restore the text for the message below
        }
    }

    // is it a value?
    char *end;
    errno = 0;
    tok->threshold = strtod(tmp.s, &end);
    if ( errno!=0 || end!=tmp.s+len )
        error("[%s:%d %s] Error: the tag \"INFO/%s\" is not defined in the VCF header\n", __FILE__,__LINE__,__FUNCTION__,tmp.s);

    free(tmp.s);
    return 0;
}
/*
 * Plugin start-up: parse the command line, duplicate the input header and
 * mark the samples selected with -s/--sample-list.
 *
 * After a successful call args.selected_smps[i] is 1 for every selected
 * sample index i and 0 otherwise, and args.gt_arr is NULL.
 * Returns 0; usage problems, unknown samples, an input without samples or
 * without a GT header line are reported through error().
 */
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    memset(&args,0,sizeof(args_t));

    int i;
    static struct option loptions[] =
    {
        {"help",        no_argument,       0,'h'},
        {"sample-list", required_argument, 0,'s'},
        {0,0,0,0}
    };

    char **smps_strs = NULL;
    int c;
    while ((c = getopt_long(argc, argv, "?s:h",loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 's':
                smps_strs = hts_readlist(optarg,0,&(args.n_sel_smps));
                if ( !smps_strs || args.n_sel_smps == 0 )
                {
                    fprintf(stderr, "Sample specification not valid.\n");
                    error("%s", usage());
                }
                break;
            // usage() only returns the help text, it has to be printed explicitly
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    if ( optind != argc ) error("%s", usage());  // too many files given

    args.hdr = bcf_hdr_dup(in);

    // Samples parsing from header and input option
    if ( !bcf_hdr_nsamples(args.hdr) )
    {
        error("No samples in input file.\n");
    }
    args.nsmp = bcf_hdr_nsamples(args.hdr);
    args.selected_smps = (int*) calloc(args.nsmp,sizeof(int));
    for ( i = 0; i < args.n_sel_smps; i++ )
    {
        int ind = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, smps_strs[i]);
        if ( ind == -1 )
        {
            error("Sample '%s' not in input vcf file.\n", smps_strs[i]);
        }
        else
        {
            args.selected_smps[ind] = 1;
        }
        free(smps_strs[i]);
    }
    free(smps_strs);

    if ( bcf_hdr_id2int(args.hdr, BCF_DT_ID, "GT")<0 )
        error("[E::%s] GT not present in the header\n", __func__);

    args.gt_arr = NULL;

    return 0;
}
int main(int argc, char **argv) { int i, n; static struct option const long_opts[] = { {"out", required_argument, NULL, 1}, {"report", required_argument, NULL, 2}, {"dotasref", no_argument, NULL, 3}, {"help", no_argument, NULL, 0}, {"version", no_argument, NULL, 4}, {"export_uncov", no_argument, NULL, 5} }; bool help = FALSE; bool report_version = FALSE; while ((n = getopt_long(argc, argv, "1:2:304", long_opts, NULL)) >= 0) { switch (n) { case 1 : outfile = strdup(optarg); break; case 2 : report = strdup(optarg); break; case 3 : dotasref = TRUE; break; case 0 : help = TRUE; break; case 4 : report_version = TRUE; break; case 5 : export_uncover = TRUE; break; default : return 1; } if ( help ) return usage(); if ( report_version ) return show_version(); } n = argc - optind; if ( n > 1 ) errabort("only accept one input vcf"); if ( export_uncover == TRUE && outfile == FALSE) { warnings("export uncove region only used with option --out"); export_uncover = FALSE; } char * input; if ( n == 0 ) input = strdup("-"); else input = strdup(argv[optind]); htsFile * fp = read_vcf_file(input); enum htsExactFormat fmt = hts_get_format(fp)->format; if ( fmt != vcf && fmt != bcf ) errabort("This is not a VCF/BCF file : %s", input); bcf_hdr_t * hdr = bcf_hdr_read(fp); int n_samples = bcf_hdr_nsamples(hdr); if ( n_samples != 2 ) errabort("the input VCF/BCF file must contain only two samples! 
%d", n_samples); LOG("Using sample %s as ref ...", hdr->samples[0]); LOG("Using sample %s as test ...", hdr->samples[1]); uint32_t matrix[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} }; bcf1_t * v = bcf_init1(); kstring_t str = { 0, 0, 0 }; uint32_t line = 0; htsFile *out = NULL; if ( outfile && !check_filename(outfile) ) out = hts_open(outfile, mode); if ( out != NULL ) bcf_hdr_write(out, hdr); while ( bcf_read1(fp, hdr, v) >= 0 ) { bcf_unpack(v, BCF_UN_STR|BCF_UN_FMT); int k; str.l = 0; int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT"); if ( !bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, tag_id) ) warnings("There is no 'GT' in the header!"); for ( i = 0; i < v->n_fmt; ++i ) if ( v->d.fmt[i].id == tag_id ) break; if ( i == v->n_fmt ) { vcf_format1(hdr, v, &str); LOG("There is no tag GT in this line : %s", str.s); continue; } corr_t xy[2] = { {-1, -2, -2}, {-1, -2, -2} }; bcf_fmt_t * fmt = &v->d.fmt[i]; for ( i = 0; i < 2; ++i ) { int corr = i; if ( fmt == NULL ) { if ( dotasref == TRUE ) xy[corr].alt = ALT_IS_REF; else xy[corr].alt = ALT_IS_UNC; continue; } int last = -2; uint8_t *d = (uint8_t*)((char*)fmt->p + fmt->size*i); for ( k = 0; k < fmt->n && d[k] != (uint8_t)bcf_int8_vector_end; ++k ) { int curr = d[k]>>1; if ( last != curr ) { if ( curr ) { if ( last == -2 ) xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; else xy[corr].alt = ALT_IS_HET; } else { xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC; } } else { if ( curr ) { xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; } else { xy[corr].alt = dotasref == TRUE ? 
ALT_IS_REF : ALT_IS_UNC; } } if (last == -2 ) { xy[corr].min = xy[corr].max = curr; } else { if ( curr < xy[corr].min ) xy[corr].min = curr; else if ( curr > xy[corr].max ) xy[corr].max = curr; } last = curr; } } matrix[xy[0].alt][xy[1].alt]++; if ( xy[0].alt != xy[1].alt && out != NULL) { if ( xy[0].alt == ALT_IS_UNC || xy[1].alt == ALT_IS_UNC ) { if ( export_uncover == TRUE ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } else { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } if ( xy[0].alt == ALT_IS_HET && xy[1].alt == ALT_IS_HET && (xy[0].min != xy[1].min || xy[0].max != xy[1].max ) ) { bias++; matrix[ALT_IS_HET][ALT_IS_HET]--; if ( out != NULL ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } line++; } if ( out ) hts_close(out); if ( str.m ) free(str.s); write_report(matrix, hdr); bcf_hdr_destroy(hdr); free(input); bcf_destroy1(v); if ( outfile ) free(outfile); if ( report ) free(report); if ( hts_close(fp) ) warnings("hts_close returned non-zero status: %s", input); return 0; }
/*
 * Select the samples shared by all readers.
 *
 * fname is a sample list (a file name when is_file is non-zero, otherwise
 * passed to hts_readlist() as a literal list), "-" for all samples of the
 * first reader, or a list prefixed with "^" to take all samples of the first
 * reader except the listed ones. A requested sample missing from any reader
 * is skipped with a warning.
 *
 * On success files->samples / files->n_smpl hold copies of the selected
 * names and every reader gets a samples[] array mapping them to its own
 * header indices. Returns 1 on success and 0 when the list cannot be read or
 * no sample is left.
 */
int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
{
    int i, j, nsmpl = 0, nlist = 0;
    char **list = NULL;     // names read from fname, owned by this function
    char **smpl = NULL;     // candidate names, not owned

    void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
    if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
    {
        // skip the leading '^', it is not part of the first name
        list = hts_readlist(exclude ? fname+1 : fname, is_file, &nlist);
        if ( !list )
        {
            fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
            if ( exclude ) khash_str2int_destroy(exclude);
            return 0;
        }
        if ( exclude )
        {
            // keys are passed from list[], which is kept until the hash is destroyed
            for (i=0; i<nlist; i++)
                khash_str2int_inc(exclude, list[i]);
        }
    }
    if ( list && !exclude )
    {
        smpl  = list;
        nsmpl = nlist;
    }
    else
    {
        // all samples, or all but the excluded ones: start from the first reader
        smpl  = files->readers[0].header->samples;   // intersection of all samples
        nsmpl = bcf_hdr_nsamples(files->readers[0].header);
    }

    files->samples = NULL;
    files->n_smpl  = 0;
    for (i=0; i<nsmpl; i++)
    {
        if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue;

        // n_isec ends up as the index of the first reader lacking the sample
        int n_isec = 0;
        for (j=0; j<files->nreaders; j++)
        {
            if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
            n_isec++;
        }
        if ( n_isec!=files->nreaders )
        {
            fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
            continue;
        }
        files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
        files->samples[files->n_smpl++] = strdup(smpl[i]);
    }

    if ( exclude ) khash_str2int_destroy(exclude);
    if ( list )
    {
        for (i=0; i<nlist; i++) free(list[i]);
        free(list);
    }

    if ( !files->n_smpl )
    {
        if ( files->nreaders>1 )
            fprintf(stderr,"No samples in common.\n");
        return 0;
    }
    for (i=0; i<files->nreaders; i++)
    {
        bcf_sr_t *reader = &files->readers[i];
        reader->samples  = (int*) malloc(sizeof(int)*files->n_smpl);
        reader->n_smpl   = files->n_smpl;
        for (j=0; j<files->n_smpl; j++)
            reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
    }
    return 1;
}
/*
 * Write one BCF record for every kept site of this chunk.
 *
 * The output file, header and record buffer are created on the first call.
 * Per site the record carries: major/minor alleles, FILTER=PASS, INFO/NS,
 * INFO/DP and FORMAT/DP (when counts are available), INFO/AF (when
 * frequencies are available), FORMAT/GT and FORMAT/GL,PL.
 *
 * Does nothing when doBcf is 0. Exits with a non-zero status when a record
 * cannot be written.
 */
void abcWriteBcf::print(funkyPars *pars){
  if(doBcf==0)
    return;

  kstring_t buf;
  if(fp==NULL){
    // first call: open the output and emit the header
    buf.s=NULL;buf.l=buf.m=0;
    fp=aio::openFileHts(outfiles,".bcf");
    hdr = bcf_hdr_init("w");
    rec = bcf_init1();
    print_bcf_header(fp,hdr,args,buf,header);
  }

  lh3struct *lh3 = (lh3struct*) pars->extras[5];
  freqStruct *freq = (freqStruct *) pars->extras[6];
  genoCalls *geno = (genoCalls *) pars->extras[10];

  // NOTE(review): the per-sample loops below assume that the number of
  // samples in the header equals pars->nInd - confirm in print_bcf_header.
  const int nsmpl = bcf_hdr_nsamples(hdr);

  for(int s=0;s<pars->numSites;s++){
    if(pars->keepSites[s]==0)
      continue;

    rec->rid = bcf_hdr_name2id(hdr,header->target_name[pars->refId]);
    rec->pos = pars->posi[s];//<- maybe one index?

    char majmin[4]={intToRef[pars->major[s]],',',intToRef[pars->minor[s]],'\0'};
    bcf_update_alleles_str(hdr, rec, majmin);
    rec->qual = 29;

    // .. FILTER
    int32_t tmpi = bcf_hdr_id2int(hdr, BCF_DT_ID, "PASS");
    bcf_update_filter(hdr, rec, &tmpi, 1);

    // .. INFO
    tmpi = pars->keepSites[s];
    bcf_update_info_int32(hdr, rec, "NS", &tmpi, 1);

    if(pars->counts){
      // counts[s] holds four base counts per individual
      int depth = 0;
      for(int i=0; i<4*pars->nInd; i++)
        depth += pars->counts[s][i];
      tmpi = depth;
      bcf_update_info_int32(hdr, rec, "DP", &tmpi, 1);
    }
    if(freq){
      float tmpf = freq->freq_EM[s];
      bcf_update_info_float(hdr, rec, "AF", &tmpf, 1);
    }

    // .. FORMAT
    assert(geno);
    if(geno){
      int32_t *tmpia = (int*)malloc(nsmpl*2*sizeof(int32_t));
      for(int i=0; i<pars->nInd;i++){
        // called genotype 0/1/other -> 0/0, 0/1, 1/1
        if(geno->dat[s][i]==0){
          tmpia[2*i+0] = bcf_gt_unphased(0);
          tmpia[2*i+1] = bcf_gt_unphased(0);
        }else if(geno->dat[s][i]==1){
          tmpia[2*i+0] = bcf_gt_unphased(0);
          tmpia[2*i+1] = bcf_gt_unphased(1);
        } else{
          tmpia[2*i+0] = bcf_gt_unphased(1);
          tmpia[2*i+1] = bcf_gt_unphased(1);
        }
      }
      bcf_update_genotypes(hdr, rec, tmpia, nsmpl*2);
      free(tmpia);
    }

    if(pars->counts){
      int32_t *smpl_dp = (int32_t*)malloc(sizeof(int32_t)*nsmpl);
      suint *ary=pars->counts[s];
      // depth of sample i is the sum of its own four base counts
      for(int i=0;i<nsmpl;i++)
        smpl_dp[i] = ary[4*i+0]+ary[4*i+1]+ary[4*i+2]+ary[4*i+3];
      bcf_update_format_int32(hdr, rec, "DP", smpl_dp,nsmpl );
      free(smpl_dp);
    }

    assert(lh3);
    if(lh3){
      float   *gl = (float*)malloc(3*nsmpl*sizeof(float));
      int32_t *pl = (int32_t*)malloc(3*nsmpl*sizeof(int32_t));
      double *ary = lh3->lh3[s];   // natural-log genotype likelihoods, three per sample
      for(int i=0;i<nsmpl;i++)
        for(int j=0;j<3;j++){
          const double lnlik = ary[i*3+j];
          // GL: log10-scaled likelihood
          gl[i*3+j] = lnlik/M_LN10;
          // PL: -10*log10(L), scaled first and then rounded to the nearest
          // integer; computed from the log value directly so that very small
          // likelihoods do not underflow through exp()
          pl[i*3+j] = (int32_t)(-10.0*lnlik/M_LN10 + 0.5);
        }
      bcf_update_format_float(hdr, rec, "GL", gl,3*nsmpl );
      bcf_update_format_int32(hdr, rec, "PL", pl,3*nsmpl );
      free(gl);
      free(pl);
    }

    if ( bcf_write1(fp, hdr, rec)!=0 ){
      fprintf(stderr,"Failed to write to \n");
      exit(1);   // a failed write must not be reported as success
    }
    bcf_clear1(rec);
  }
}
/**
 * Set the record's contig from its name.
 *
 * The name is looked up in the contig dictionary of the builder's header and
 * the resulting id is stored as is, including -1 for an unknown name: the
 * contig id is validated at build time, if validation is turned on.
 *
 * Returns *this so that calls can be chained.
 */
VariantBuilder& VariantBuilder::set_chromosome(const std::string& chromosome) {
  const auto contig_id = bcf_hdr_id2int(m_header.m_header.get(), BCF_DT_CTG, chromosome.c_str());
  m_contig.set(contig_id);
  return *this;
}
/*
 * Open the input, create one output file per sample in args->output_dir and
 * parse the list of tags to keep (args->keep_tags).
 *
 * Output files are named after args->bnames[i] with whitespace replaced by
 * '_' and the extension chosen by args->output_type; samples without a base
 * name are skipped. The tag list is comma-separated; "INFO" / "FMT" /
 * "FORMAT" keep the whole column, "INFO/X" or "FMT/X" keep single tags, and
 * a bare name inherits the column of the preceding prefixed item. Without
 * any selection everything is kept.
 * All failures are reported through error().
 */
static void init_data(args_t *args)
{
    args->sr = bcf_sr_init();
    if ( args->region )
    {
        args->sr->require_index = 1;
        if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region);
    }
    if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target);
    if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
    args->hdr_in  = bcf_sr_get_header(args->sr,0);
    args->hdr_out = bcf_hdr_dup(args->hdr_in);

    if ( args->filter_str )
        args->filter = filter_init(args->hdr_in, args->filter_str);

    mkdir_p("%s/",args->output_dir);

    int i, nsmpl = bcf_hdr_nsamples(args->hdr_in);
    if ( !nsmpl ) error("No samples to split: %s\n", args->fname);
    args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh));
    args->bnames = set_file_base_names(args);
    kstring_t str = {0,0,0};
    for (i=0; i<nsmpl; i++)
    {
        if ( !args->bnames[i] ) continue;
        str.l = 0;
        kputs(args->output_dir, &str);
        // str.l is 0 for an empty directory name, there is nothing to look at then
        if ( str.l && str.s[str.l-1] != '/' ) kputc('/', &str);
        int k, l = str.l;
        kputs(args->bnames[i], &str);
        // sanitise the base name only, not the directory part; the cast is
        // required because isspace() is undefined for negative char values
        for (k=l; k<str.l; k++) if ( isspace((unsigned char)str.s[k]) ) str.s[k] = '_';
        if ( args->output_type & FT_BCF ) kputs(".bcf", &str);
        else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str);
        else kputs(".vcf", &str);
        args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type));
        if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno));

        // the output header is reused for every file with a single, renamed sample
        bcf_hdr_nsamples(args->hdr_out) = 1;
        args->hdr_out->samples[0] = args->bnames[i];
        if ( bcf_hdr_write(args->fh[i], args->hdr_out)!=0 ) error("Failed to write the header to \"%s\"\n", str.s);
    }
    free(str.s);

    // parse tags
    int is_info = 0, is_fmt = 0;
    char *beg = args->keep_tags;
    while ( beg && *beg )
    {
        if ( !strncasecmp("INFO/",beg,5) ) { is_info = 1; is_fmt = 0; beg += 5; }
        else if ( !strcasecmp("INFO",beg) ) { args->keep_info = 1; break; }
        else if ( !strncasecmp("INFO,",beg,5) ) { args->keep_info = 1; beg += 5; continue; }
        else if ( !strncasecmp("FMT/",beg,4) ) { is_info = 0; is_fmt = 1; beg += 4; }
        else if ( !strncasecmp("FORMAT/",beg,7) ) { is_info = 0; is_fmt = 1; beg += 7; }
        else if ( !strcasecmp("FMT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strcasecmp("FORMAT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strncasecmp("FMT,",beg,4) ) { args->keep_fmt = 1; beg += 4; continue; }
        else if ( !strncasecmp("FORMAT,",beg,7) ) { args->keep_fmt = 1; beg += 7; continue; }

        // cut out the tag name in place
        char *end = beg;
        while ( *end && *end!=',' ) end++;
        char tmp = *end; *end = 0;
        int id = bcf_hdr_id2int(args->hdr_in, BCF_DT_ID, beg);
        beg = tmp ? end + 1 : end;

        // info_tags / fmt_tags are indexed by header id; 1 marks a tag to keep
        if ( is_info && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_INFO,id) )
        {
            if ( id >= args->ninfo_tags ) args->ninfo_tags = id + 1;
            hts_expand0(uint8_t, args->ninfo_tags, args->minfo_tags, args->info_tags);
            args->info_tags[id] = 1;
        }
        if ( is_fmt && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_FMT,id) )
        {
            if ( id >= args->nfmt_tags ) args->nfmt_tags = id + 1;
            hts_expand0(uint8_t, args->nfmt_tags, args->mfmt_tags, args->fmt_tags);
            args->fmt_tags[id] = 1;
        }
    }
    // nothing selected: keep all annotations
    if ( !args->keep_info && !args->keep_fmt && !args->ninfo_tags && !args->nfmt_tags )
    {
        args->keep_info = args->keep_fmt = 1;
    }
}
// Parse filter expression and convert to reverse polish notation. Dijkstra's shunting-yard algorithm filter_t *filter_init(bcf_hdr_t *hdr, const char *str) { filter_t *filter = (filter_t *) calloc(1,sizeof(filter_t)); filter->str = strdup(str); filter->hdr = hdr; int nops = 0, mops = 0, *ops = NULL; // operators stack int nout = 0, mout = 0; // filter tokens, RPN token_t *out = NULL; char *tmp = filter->str; int last_op = -1; while ( *tmp ) { int len, ret; ret = filters_next_token(&tmp, &len); if ( ret==-1 ) error("Missing quotes in: %s\n", str); // fprintf(stderr,"token=[%c] .. [%s] %d\n", "x()[<=>]!|&+-*/Mm"[ret], tmp, len); // int i; for (i=0; i<nops; i++) fprintf(stderr," .%c.", "x()[<=>]!|&+-*/Mm"[ops[i]]); fprintf(stderr,"\n"); if ( ret==TOK_MAX || ret==TOK_MIN || ret==TOK_AVG ) { nout++; hts_expand0(token_t, nout, mout, out); filters_init_func(filter, ret, &tmp, &out[nout-1]); } else if ( ret==TOK_LFT ) // left bracket { nops++; hts_expand(int, nops, mops, ops); ops[nops-1] = ret; } else if ( ret==TOK_RGT ) // right bracket { while ( nops>0 && ops[nops-1]!=TOK_LFT ) { nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } if ( nops<=0 ) error("Could not parse: %s\n", str); nops--; } else if ( ret!=TOK_VAL ) // one of the operators { // detect unary minus: replace -value with -1*(value) if ( ret==TOK_SUB && last_op!=TOK_VAL && last_op!=TOK_RGT ) { nout++; hts_expand0(token_t, nout, mout, out); token_t *tok = &out[nout-1]; tok->tok_type = TOK_VAL; tok->hdr_id = -1; tok->pass = -1; tok->threshold = -1.0; ret = TOK_MULT; } else { while ( nops>0 && op_prec[ret] < op_prec[ops[nops-1]] ) { nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } } nops++; hts_expand(int, nops, mops, ops); ops[nops-1] = ret; } else if ( !len ) { if ( *tmp && !isspace(*tmp) ) error("Could not parse the expression: [%s]\n", str); break; // all tokens read } else // annotation name or filtering value { nout++; 
hts_expand0(token_t, nout, mout, out); filters_init1(filter, tmp, len, &out[nout-1]); tmp += len; } last_op = ret; } while ( nops>0 ) { if ( ops[nops-1]==TOK_LFT || ops[nops-1]==TOK_RGT ) error("Could not parse the expression: [%s]\n", filter->str); nout++; hts_expand0(token_t, nout, mout, out); out[nout-1].tok_type = ops[nops-1]; nops--; } // In the special cases of %TYPE and %FILTER the BCF header IDs are yet unknown. Walk through the // list of operators and convert the strings (e.g. "PASS") to BCF ids. The string value token must be // just before or after the %FILTER token and they must be followed with a comparison operator. // This code is fragile: improve me. int i; for (i=0; i<nout; i++) { if ( out[i].tok_type!=TOK_VAL ) continue; if ( !out[i].tag ) continue; if ( !strcmp(out[i].tag,"%TYPE") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int j = i+1; if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression: %s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( !strcasecmp(out[j].key,"snp") || !strcasecmp(out[j].key,"snps") ) out[j].threshold = VCF_SNP; else if ( !strcasecmp(out[j].key,"indel") || !strcasecmp(out[j].key,"indels") ) out[j].threshold = VCF_INDEL; else if ( !strcasecmp(out[j].key,"mnp") || !strcasecmp(out[j].key,"mnps") ) out[j].threshold = VCF_MNP; else if ( !strcasecmp(out[j].key,"other") ) out[j].threshold = VCF_OTHER; else error("The type \"%s\" not recognised: %s\n", out[j].key, filter->str); out[j].tag = out[j].key; out[j].key = NULL; i = j; continue; } if ( !strcmp(out[i].tag,"%FILTER") ) { if ( i+1==nout ) error("Could not parse the expression: %s\n", filter->str); int j = i+1; if ( out[j].tok_type==TOK_EQ || out[j].tok_type==TOK_NE ) j = i - 1; if ( out[j].tok_type!=TOK_VAL || !out[j].key ) error("[%s:%d %s] Could not parse the expression, an unquoted string value perhaps? 
%s\n", __FILE__,__LINE__,__FUNCTION__, filter->str); if ( strcmp(".",out[j].key) ) { out[j].hdr_id = bcf_hdr_id2int(filter->hdr, BCF_DT_ID, out[j].key); if ( !bcf_hdr_idinfo_exists(filter->hdr,BCF_HL_FLT,out[j].hdr_id) ) error("The filter \"%s\" not present in the VCF header\n", out[j].key); } else out[j].hdr_id = -1; out[j].tag = out[j].key; out[j].key = NULL; out[i].hdr_id = out[j].hdr_id; i = j; continue; } } // filter_debug_print(out, nout); if ( mops ) free(ops); filter->filters = out; filter->nfilters = nout; filter->flt_stack = (token_t **)malloc(sizeof(token_t*)*nout); return filter; }
/*
 *  Prepare everything needed before records are concatenated:
 *    - read the header of every input file and merge it into args->out_hdr,
 *      insisting that all files carry the same samples in the same order;
 *    - with phased concat, remember where each file starts (args->start_pos);
 *    - open the output file and write the merged header;
 *    - set up the synced reader for the -a (allow overlaps) or phased mode.
 *
 *  All failures are reported through error().
 */
static void init_data(args_t *args)
{
    bcf1_t *line = NULL;

    // With phased concat, the chunks overlap and come in the right order. To
    // avoid opening all files at once, store start positions to recognise need
    // for the next one. This way we can keep only two open chunks at once.
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        line = bcf_init();
    }

    int i, prev_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r");
        if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);

        args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
        if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
            error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        int j;
        for (j=0; j<bcf_hdr_nsamples(hdr); j++)
        {
            if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
                error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
        }

        if ( args->phased_concat )
        {
            // start_pos[i]: -2 for a file with no readable record, -1 when the
            // first record is on a different contig than the previous
            // non-empty file, line->pos otherwise.
            int ret = bcf_read(fp, hdr, line);
            if ( ret!=0 ) args->start_pos[i] = -2;  // empty file
            else
            {
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
                args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
                prev_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        hts_close(fp);
    }
    if ( line ) bcf_destroy(line);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");

    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    // A failed header write would otherwise go unnoticed and leave a broken output file
    if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 )
        error("[%s] Error: cannot write the header to %s\n", __func__, args->output_fname);

    if ( args->allow_overlaps )
    {
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        if ( args->remove_dups )
        {
            if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
            else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
            else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
        }
        for (i=0; i<args->nfnames; i++)
        {
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) )
                error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
        }
    }
    else if ( args->phased_concat )
    {
        // Remove empty files from the list: move every non-empty entry in
        // front of the empty ones (start_pos==-2), keeping the relative order
        // of the non-empty files, then drop the tail.
        int nok = 0;
        while (1)
        {
            while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
            if ( nok==args->nfnames ) break;
            i = nok;
            while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
            if ( i==args->nfnames ) break;

            int tmp_pos = args->start_pos[nok];
            args->start_pos[nok] = args->start_pos[i];
            args->start_pos[i] = tmp_pos;

            char *tmp_fname = args->fnames[nok];
            args->fnames[nok] = args->fnames[i];
            args->fnames[i] = tmp_fname;
        }
        for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
        args->nfnames = nok;

        // Consecutive files on the same contig must not start before their predecessor
        for (i=1; i<args->nfnames; i++)
        {
            if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
                error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);
        }

        args->prev_chr = -1;
        args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        args->ifname = 0;
    }
}
int run(int argc, char **argv) { char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL; memset(&args,0,sizeof(args_t)); args.mode = 0; args.output_fname = "-"; static struct option loptions[] = { {"trio",1,0,'t'}, {"trio-file",1,0,'T'}, {"delete",0,0,'d'}, {"list",1,0,'l'}, {"count",0,0,'c'}, {"rules",1,0,'r'}, {"rules-file",1,0,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0) { switch (c) { case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'R': rules_fname = optarg; break; case 'r': rules_string = optarg; break; case 'd': args.mode |= MODE_DELETE; break; case 'c': args.mode |= MODE_COUNT; break; case 'l': if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; else error("The argument not recognised: --list %s\n", optarg); break; case 't': trio_samples = optarg; break; case 'T': trio_file = optarg; break; case 'h': case '?': default: error("%s",usage()); break; } } if ( rules_fname ) args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); else args.rules = init_rules(&args, rules_string); if ( !args.rules ) return -1; args.itr = regitr_init(args.rules); args.itr_ori = regitr_init(args.rules); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else error("%s",usage()); } else fname = argv[optind]; if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n"); if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); if 
( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; args.sr = bcf_sr_init(); if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); args.hdr = bcf_sr_get_header(args.sr, 0); args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); bcf_hdr_write(args.out_fh, args.hdr); int i, n = 0; char **list; if ( trio_samples ) { args.ntrios = 1; args.trios = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(trio_samples, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); for (i=0; i<n; i++) free(list[i]); free(list); } if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; i<n; i++) { char *ss = list[i], *se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s: %s\n",trio_file, ss); *se = 0; args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s\n",trio_file); *se = 0; args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; if ( *ss=='\0' ) error("Could not parse %s\n",trio_file); args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss); free(list[i]); } free(list); } while ( bcf_sr_next_line(args.sr) ) { bcf1_t *line = bcf_sr_get_line(args.sr,0); line = process(line); if ( line ) { if ( 
line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); bcf_write1(args.out_fh, args.hdr, line); } } fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); for (i=0; i<args.ntrios; i++) { trio_t *trio = &args.trios[i]; fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ); } free(args.gt_arr); free(args.trios); regitr_destroy(args.itr); regitr_destroy(args.itr_ori); regidx_destroy(args.rules); bcf_sr_destroy(args.sr); if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); return 0; }
/*
 *  Open the input via a synced reader, build the output header (optionally
 *  restricted to the samples requested by the user), remap ploidy and family
 *  indexes to the output sample order, open the output file, initialise the
 *  selected caller and write the header.
 *
 *  In CF_QCALL mode the function returns right after the output file is
 *  opened; no header is written here in that case.
 *  All failures are reported through error().
 */
static void init_data(args_t *args)
{
    args->aux.srs = bcf_sr_init();

    // Open files for input and output, initialize structures
    if ( args->targets )
    {
        if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
            error("Failed to read the targets: %s\n", args->targets);

        if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
        {
            args->aux.srs->targets->missed_reg_handler = print_missed_line;
            args->aux.srs->targets->missed_reg_data = args;
        }
    }
    if ( args->regions )
    {
        // The message names the option which actually failed (regions, not targets)
        if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions);
    }

    int i;
    if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open: %s\n", args->bcf_fname);

    if ( args->nsamples && args->nsamples != bcf_hdr_nsamples(args->aux.srs->readers[0].header) )
    {
        // A subset of samples was requested: derive a reduced header
        args->samples_map = (int *) malloc(sizeof(int)*args->nsamples);
        args->aux.hdr = bcf_hdr_subset(args->aux.srs->readers[0].header, args->nsamples, args->samples, args->samples_map);
        for (i=0; i<args->nsamples; i++)
            if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
        if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
    }
    else
    {
        args->aux.hdr = bcf_hdr_dup(args->aux.srs->readers[0].header);
        for (i=0; i<args->nsamples; i++)
            if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
                error("No such sample: %s\n", args->samples[i]);
    }

    // Reorder ploidy and family indexes to match mpileup's output and exclude samples which are not available
    if ( args->aux.ploidy )
    {
        for (i=0; i<args->aux.nfams; i++)
        {
            int j;
            for (j=0; j<3; j++)
            {
                int k = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[ args->aux.fams[i].sample[j] ]);
                if ( k<0 ) error("No such sample: %s\n", args->samples[ args->aux.fams[i].sample[j] ]);
                args->aux.fams[i].sample[j] = k;
            }
        }
        uint8_t *ploidy = (uint8_t*) calloc(bcf_hdr_nsamples(args->aux.hdr), 1);
        for (i=0; i<args->nsamples; i++)    // i index in -s sample list
        {
            int j = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[i]);     // j index in the output VCF / subset VCF
            if ( j<0 )
            {
                fprintf(stderr,"Warning: no such sample: \"%s\"\n", args->samples[i]);
                continue;
            }
            ploidy[j] = args->aux.ploidy[i];
        }
        // From here on nsamples counts the samples of the output header
        args->nsamples = bcf_hdr_nsamples(args->aux.hdr);
        for (i=0; i<args->nsamples; i++)
            assert( ploidy[i]==0 || ploidy[i]==1 || ploidy[i]==2 );
        free(args->aux.ploidy);
        args->aux.ploidy = ploidy;
    }

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));

    if ( args->flag & CF_QCALL )
        return;

    if ( args->flag & CF_MCALL )
        mcall_init(&args->aux);

    if ( args->flag & CF_CCALL )
        ccall_init(&args->aux);

    if ( args->flag&CF_GVCF )
    {
        bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        args->gvcf.rid = -1;
        args->gvcf.line = bcf_init1();
        // Two alleles per sample, both initialised to the unphased allele 0
        args->gvcf.gt = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr));
        for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
        {
            args->gvcf.gt[2*i+0] = bcf_gt_unphased(0);
            args->gvcf.gt[2*i+1] = bcf_gt_unphased(0);
        }
    }

    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");

    bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");

    // A failed header write would otherwise go unnoticed and leave a broken output file
    if ( bcf_hdr_write(args->out_fh, args->aux.hdr)!=0 )
        error("[%s] Error: cannot write the header to %s\n", __func__, args->output_fname);

    if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}
/*
 *  Open the output file and extend the header of the first reader with the
 *  FILTER definitions this run may set:
 *    - the soft filter for -i/-e expressions (named by the user or, for "+",
 *      the first unused "FilterN");
 *    - SnpGap and IndelGap when the corresponding gap options are set.
 *  Finally compiles the filtering expression, if one was given.
 *
 *  All failures are reported through error().
 */
static void init_data(args_t *args)
{
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    args->hdr = args->files->readers[0].header;
    args->flt_pass = bcf_hdr_id2int(args->hdr,BCF_DT_ID,"PASS");
    assert( !args->flt_pass );  // sanity check: required by BCF spec

    // -i or -e: append FILTER line
    if ( args->soft_filter && args->filter_logic )
    {
        kstring_t flt_name = {0,0,0};
        if ( strcmp(args->soft_filter,"+") )
            kputs(args->soft_filter, &flt_name);
        else
        {
            // Make up a filter name: Filter1, Filter2, ... until an unused one is found
            int i = 0, id = -1;
            do
            {
                flt_name.l = 0;     // start over, otherwise the candidates pile up as "Filter1Filter2"
                ksprintf(&flt_name,"Filter%d", ++i);
                id = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s);
            }
            while ( bcf_hdr_idinfo_exists(args->hdr,BCF_HL_FLT,id) );
        }

        // escape quotes
        kstring_t tmp = {0,0,0};
        char *t = args->filter_str;
        while ( *t )
        {
            if ( *t=='"' ) kputc('\\',&tmp);
            kputc(*t,&tmp);
            t++;
        }
        // tmp.s stays NULL for an empty expression; never hand NULL to %s
        const char *expr = tmp.s ? tmp.s : "";
        const char *when = args->filter_logic & FLT_INCLUDE ? "not true" : "true";

        int ret = bcf_hdr_printf(args->hdr, "##FILTER=<ID=%s,Description=\"Set if %s: %s\">", flt_name.s, when, expr);
        if ( ret!=0 )
            error("Failed to append header line: ##FILTER=<ID=%s,Description=\"Set if %s: %s\">\n", flt_name.s, when, expr);
        args->flt_fail = bcf_hdr_id2int(args->hdr,BCF_DT_ID,flt_name.s);
        assert( args->flt_fail>=0 );
        free(flt_name.s);
        free(tmp.s);
    }

    if ( args->snp_gap || args->indel_gap )
    {
        // Without -i/-e the user-supplied soft filter name is not used; say so
        if ( !args->filter_logic && args->soft_filter && strcmp(args->soft_filter,"+") )
        {
            kstring_t tmp = {0,0,0};
            if ( args->snp_gap ) kputs("\"SnpGap\"", &tmp);
            if ( args->indel_gap )
            {
                if ( tmp.s ) kputs(" and ", &tmp);
                kputs("\"IndelGap\"", &tmp);
            }
            fprintf(stderr,"Warning: using %s filter name instead of \"%s\"\n", tmp.s,args->soft_filter);
            free(tmp.s);
        }

        rbuf_init(&args->rbuf, 64);
        args->rbuf_lines = (bcf1_t**) calloc(args->rbuf.m, sizeof(bcf1_t*));
        if ( args->snp_gap )
        {
            bcf_hdr_printf(args->hdr, "##FILTER=<ID=SnpGap,Description=\"SNP within %d bp of an indel\">", args->snp_gap);
            args->SnpGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "SnpGap");
            assert( args->SnpGap_id>=0 );
        }
        if ( args->indel_gap )
        {
            bcf_hdr_printf(args->hdr, "##FILTER=<ID=IndelGap,Description=\"Indel within %d bp of an indel\">", args->indel_gap);
            args->IndelGap_id = bcf_hdr_id2int(args->hdr, BCF_DT_ID, "IndelGap");
            assert( args->IndelGap_id>=0 );
        }
    }

    if (args->record_cmd_line) bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_filter");

    if ( args->filter_str )
        args->filter = filter_init(args->hdr, args->filter_str);
}