/*
 *  Per-record callback.
 *
 *  For biallelic records (n_allele==2) the FORMAT/GL values are read into the
 *  file-scope buffer `gl`, estimate_gt() is called, and the record is
 *  annotated with FORMAT/DS (from `dosage`), INFO/AF (the value returned by
 *  estimate_gt()) and genotypes (from `gt`). All other records are returned
 *  unchanged.
 *
 *  Relies on file-scope state: in_hdr, out_hdr, gl, ngl, gt, ngt, dosage and
 *  n (presumably the number of samples - confirm against the init code).
 *
 *  Returns the (possibly modified) record passed in.
 */
bcf1_t *process(bcf1_t *rec)
{
    float af;
    if ( rec->n_allele==2 )
    {
        // `ngl` is only the capacity of the `gl` buffer: htslib grows it on
        // demand and leaves it untouched when the tag is absent. The number
        // of values read for *this* record is the return value, so that is
        // what has to be validated; otherwise a record without GL could be
        // processed with the previous record's likelihoods.
        int nread = bcf_get_format_float(in_hdr, rec, "GL", &gl, &ngl);
        assert(nread==(3*n));
        (void) nread;   // silence the unused warning when NDEBUG is defined

        // bcf_get_genotypes(in_hdr, rec, &gt, &ngt);
        // assert(ngt==(2*n));

        af = estimate_gt();
        // fprintf(stderr,"%d AF=%f\n",rec->pos+1,af);

        bcf_update_format_float(out_hdr,rec,"DS",dosage,n);
        bcf_update_info_float(out_hdr,rec,"AF",&af,1);
        bcf_update_genotypes(out_hdr, rec, gt, ngt);
    }
    return rec;
}
/*
 *  Per-record callback.
 *
 *  In GP_TO_GL mode the FORMAT/GP values of the record are converted to
 *  FORMAT/GL: every value that is neither missing nor a vector-end marker is
 *  replaced by its log10, with a probability of exactly 0 mapped to -99.
 *  When `drop_source_tag` is set, FORMAT/GP is removed afterwards.
 *
 *  Records without FORMAT/GP are returned untouched.
 *
 *  Relies on file-scope state: mode, in_hdr, out_hdr, farr, mfarr and
 *  drop_source_tag.
 *
 *  Returns the (possibly modified) record passed in.
 */
bcf1_t *process(bcf1_t *rec)
{
    int i, n;
    if ( mode==GP_TO_GL )
    {
        n = bcf_get_format_float(in_hdr,rec,"GP",&farr,&mfarr);

        // A negative value means the tag is absent or could not be read.
        // It must not be passed on as a value count to the update call.
        if ( n<0 ) return rec;

        for (i=0; i<n; i++)
        {
            if ( bcf_float_is_missing(farr[i]) || bcf_float_is_vector_end(farr[i]) ) continue;
            // GL is defined as log10-scaled likelihoods, hence log10 rather
            // than the natural logarithm.
            farr[i] = farr[i] ? log10(farr[i]) : -99;
        }
        bcf_update_format_float(out_hdr,rec,"GL",farr,n);
        if ( drop_source_tag ) bcf_update_format_float(out_hdr,rec,"GP",NULL,0);
    }
    return rec;
}
/*
 *  Reads the input file and prepares everything needed for fitting/plotting:
 *
 *    - opens args->fname through a synced reader, optionally restricted by
 *      args->regions_list / args->targets_list;
 *    - selects the sample (args->sample, or the only sample in the file) and
 *      requires the FORMAT/BAF tag to be defined in the header;
 *    - fills args->xvals with args->nbins evenly spaced values in [0,1];
 *    - builds one BAF histogram (dist_t) per run of records sharing the same
 *      chromosome and calls init_dist() on each;
 *    - opens <output_dir>/dist.dat (kept open in args->dat_fp) and writes its
 *      header lines;
 *    - writes the matplotlib script <output_dir>/dist.py and makes it
 *      executable.
 *
 *  Any failure reported by the reader or header calls terminates through
 *  error().
 */
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) )
        error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;

    // Sample selection: -s is optional only for single-sample files
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 )
        error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    // Bin centres shared by all distributions
    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        bcf1_t *line = bcf_sr_get_line(files,0);

        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        // The value comes straight from the input file. Anything outside
        // [0,1] (NaN included) would index outside of yvals below.
        if ( !(baf[0]>=0 && baf[0]<=1) ) continue;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    // Header of the data file; the file handle stays open in args->dat_fp
    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
    //-------- matplotlib script --------------
    // The script text is emitted verbatim. Python blocks are delimited by
    // indentation, so the leading spaces inside the literals are significant.
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        "    csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "    with open(outdir+'/dist.dat', 'rb') as f:\n"
        "        reader = csv.reader(f, 'tab')\n"
        "        for row in reader:\n"
        "            if row[0][0]=='#': continue\n"
        "            type = row[0]\n"
        "            chr = row[1]\n"
        "            if type=='DIST':\n"
        "                if chr not in dat: dat[chr] = []\n"
        "                dat[chr].append(row)\n"
        "            elif type=='FIT':\n"
        "                if chr not in fit: fit[chr] = []\n"
        "                fit[chr].append(row)\n"
        "            elif type=='CN':\n"
        "                cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "    if chr in fit:\n"
        "        for i in range(len(fit[chr])):\n"
        "            pfit = fit[chr][i]\n"
        "            exec('def xfit(x): return '+pfit[5])\n"
        "            istart = int(pfit[3])\n"
        "            iend = int(pfit[4])+1\n"
        "            vals = dat[chr][istart:iend]\n"
        "            args = {}\n"
        "            if i==0: args = {'label':'Target to Fit'}\n"
        "            ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "            if i==0: args = {'label':'Best Fit'}\n"
        "            ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "    ax.set_title('BAF distribution, chr'+chr)\n"
        "    ax.set_xlabel('BAF')\n"
        "    ax.set_ylabel('Frequency')\n"
        "    ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "    plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "    plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    xlabels = sorted(cn.keys())\n"
        "    xvals = range(len(xlabels))\n"
        "    yvals = [float(cn[x]) for x in xlabels]\n"
        "    ax.plot(xvals,yvals,'o',color='red')\n"
        "    for i in range(len(xvals)):\n"
        "        if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "    ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "    ax.set_xticks(xvals)\n"
        "    ax.set_xticklabels(xlabels,rotation=45)\n"
        "    ax.set_xlim(-1,len(xlabels))\n"
        "    ax.set_ylim(0,5.0)\n"
        "    ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "    ax.set_xlabel('Chromosome')\n"
        "    ax.set_ylabel('Copy Number')\n"
        "    plt.savefig(outdir+'/copy-number.png')\n"
        "    plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        "    def error(self, message):\n"
        "        self.print_help()\n"
        "        sys.stderr.write('error: %%s\\n' %% message)\n"
        "        sys.exit(2)\n"
        "\n"
        "def main():\n"
        "    parser = myParser()\n"
        "    parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "    parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "    parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "    args = parser.parse_args()\n"
        "    dat = {}; fit = {}; cn = {}\n"
        "    read_dat(dat,fit,cn)\n"
        "    if args.distrib!=None:\n"
        "        plot_dist(dat,fit,args.distrib)\n"
        "    if args.all:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "        plot_copy_number(cn)\n"
        "    elif args.copy_number:\n"
        "        plot_copy_number(cn)\n"
        "    else:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        "    main()\n",
        args->output_dir);
    //---------------------------------------
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
void BlockQuantify::count() { _impl->fasta_to_use.reset(new FastaFile(_impl->ref_fasta)); #ifdef DEBUG_BLOCKQUANTIFY int lastpos = 0; std::cerr << "starting block." << "\n"; #endif auto current_bs_start = _impl->variants.begin(); std::string current_chr; int current_bs = -1; bool current_bs_valid = false; // function to compute the QQ values for truth variants in the current // benchmarking superlocus const auto update_bs_qq = [this, ¤t_bs_start](BlockQuantifyImpl::variantlist_t::iterator to) { std::vector<float> tp_qqs; for(auto cur = current_bs_start; cur != to; ++cur) { const float qqq = bcfhelpers::getFormatFloat(_impl->hdr, *cur, "QQ", 1); if(std::isnan(qqq)) { continue; } const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 1); // we want the scores of all TPs in this BS if(bd == "TP") { tp_qqs.push_back(qqq); } } float t_qq = bcfhelpers::missing_float(); if(!tp_qqs.empty()) { t_qq = *(std::max_element(tp_qqs.begin(), tp_qqs.end())); } /** compute the median over all variants */ int fsize = bcf_hdr_nsamples(_impl->hdr); float * fmt = (float*)calloc((size_t) fsize, sizeof(float)); for(auto cur = current_bs_start; cur != to; ++cur) { const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 0); bcf_get_format_float(_impl->hdr, *cur, "QQ", &fmt, &fsize); if(bd != "TP") { fmt[0] = bcfhelpers::missing_float(); } else { fmt[0] = t_qq; } bcf_update_format_float(_impl->hdr, *cur, "QQ", fmt, fsize); } free(fmt); #ifdef DEBUG_BLOCKQUANTIFY const int bs = bcfhelpers::getInfoInt(_impl->hdr, *current_bs_start, "BS", -1); std::string values; for(float x : tp_qqs) { values += std::to_string(x) + ","; } std::cerr << "BS: " << bs << " T_QQ = " << t_qq << " [" << values << "]" << "\n"; #endif }; for(auto v_it = _impl->variants.begin(); v_it != _impl->variants.end(); ++v_it) { // update fields, must output GA4GH-compliant fields countVariants(*v_it); // determine benchmarking superlocus const std::string vchr = 
bcfhelpers::getChrom(_impl->hdr, *v_it); const int vbs = bcfhelpers::getInfoInt(_impl->hdr, *v_it, "BS"); if(!current_bs_valid) { current_bs = vbs; current_chr = vchr; current_bs_valid = true; } #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "current BS = " << current_bs << " vbs = " << vbs << "\n"; #endif if( current_bs_start != v_it && (vbs != current_bs || vbs < 0 || vchr != current_chr)) { update_bs_qq(v_it); current_bs = vbs; current_chr = vchr; current_bs_start = v_it; } } // write out final superlocus (if any) update_bs_qq(_impl->variants.end()); for(auto & v : _impl->variants) { #ifdef DEBUG_BLOCKQUANTIFY lastpos = v->pos; #endif // use BD and BVT to make ROCs rocEvaluate(v); } #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "finished block " << lastpos << " - " << _impl->variants.size() << " records on thread " << std::this_thread::get_id() << "\n"; #endif _impl->fasta_to_use.reset(nullptr); }
/**
 * Load genotype data for `region` from the indexed VCF/BCF file `fvcf`.
 *
 * All genotype_* members are reset first. Only biallelic records are kept,
 * and only if they carry either one FORMAT/DS value per sample or two GT
 * alleles per sample, and pass filter_genotype. For each kept record the id,
 * chromosome, 1-based start, end (INFO/END if present, otherwise start +
 * length of REF - 1) and one value per mapped sample are stored. The value
 * is the DS dosage when available, otherwise the sum of the two GT allele
 * indices (bcf_float_missing when either allele is missing).
 *
 * Samples of the file are mapped to internal indices with findSample();
 * samples mapped to a negative index are ignored.
 *
 * @param fvcf   path of the VCF/BCF file
 * @param region region string handed to bcf_sr_set_regions()
 */
void union_data::readGenotypesVCF(string fvcf,string region) {
    int n_includedG = 0;
    int n_excludedG_mult = 0;
    int n_excludedG_void = 0;
    int n_excludedG_user = 0;
    int n_includedS = 0;
    vector < int > mappingS;
    genotype_id.clear();
    genotype_chr.clear();
    genotype_start.clear();
    genotype_end.clear();
    genotype_val.clear();
    genotype_count=0;
    genotype_id_to_idx.clear();

    //Opening files
    bcf_srs_t * sr = bcf_sr_init();

    //vrb.bullet("target region [" + regionGenotype.get() + "]");
    //if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
    bcf_sr_set_regions(sr, region.c_str(), 0);
    if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
        // One message per failure reason; explicit breaks so that only the
        // matching one is ever reported.
        switch (sr->errnum) {
        case not_bgzf: vrb.error("File not compressed with bgzip!"); break;
        case idx_load_failed: vrb.error("Impossible to load index file!"); break;
        case file_type_error: vrb.error("File format not detected by htslib!"); break;
        default : vrb.error("Unknown error!"); break;
        }
    }
    bcf_hdr_t * hdr = sr->readers[0].header;

    //Sample processing
    int n_samples = bcf_hdr_nsamples(hdr);
    for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
        mappingS.push_back(findSample(string(hdr->samples[i0])));
        if (mappingS.back() >= 0) n_includedS++;
    }

    //Read genotype data
    // The *_arr pointers are buffers owned by this function and (re)allocated
    // by htslib; the n*_arr integers are their capacities, while ngt/nds/nsl
    // hold the number of values returned for the current record.
    int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
    float * ds_arr = NULL;
    bcf1_t * line;
    unsigned int linecount = 0;
    while(bcf_sr_next_line (sr)) {
        linecount ++;
        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
        line = bcf_sr_get_line(sr, 0);

        if (line->n_allele != 2) {
            n_excludedG_mult ++;
            continue;
        }

        ngt = bcf_get_genotypes(hdr, line, &gt_arr, &ngt_arr);
        nds = bcf_get_format_float(hdr, line,"DS", &ds_arr, &nds_arr);
        if (nds != n_samples && ngt != 2*n_samples) {
            n_excludedG_void ++;
            continue;
        }

        bcf_unpack(line, BCF_UN_STR);
        string sid = string(line->d.id);
        if (!filter_genotype.check(sid)) {
            n_excludedG_user ++;
            continue;
        }

        genotype_id.push_back(sid);
        genotype_chr.push_back(string(bcf_hdr_id2name(hdr, line->rid)));
        string genotype_ref = string(line->d.allele[0]);
        genotype_start.push_back(line->pos + 1);

        // Test the returned count, not the buffer capacity, to find out
        // whether this record has an END value.
        nsl = bcf_get_info_int32(hdr, line, "END", &sl_arr, &nsl_arr);
        if (nsl == 1) genotype_end.push_back(sl_arr[0]);
        else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);

        // DS is indexed per sample below, so it may only be used when it
        // holds exactly one value per sample. Otherwise fall back to GT,
        // which the check above guarantees to be two alleles per sample.
        // NOTE(review): haploid calls (second slot is a vector-end marker)
        // are not treated specially here - confirm whether they can occur.
        const bool use_ds = (nds == n_samples);

        genotype_val.push_back(vector < float > (sample_count, 0.0));
        for(int i = 0 ; i < n_samples ; i ++) {
            if (mappingS[i] < 0) continue;
            if (use_ds) genotype_val.back()[mappingS[i]] = ds_arr[i];
            else if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
            else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
        }

        pair < string, int > temp (sid,n_includedG);
        genotype_id_to_idx.insert(temp);
        n_includedG++;
    }

    //Finalize
    free(gt_arr);
    free(ds_arr);
    free(sl_arr);
    bcf_sr_destroy(sr);
    genotype_count = n_includedG;
    //vrb.bullet(stb.str(n_includedG) + " variants included");
    //if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
    //if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
    //if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
    //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
}
void BlockQuantify::count() { _impl->fasta_to_use.reset(new FastaFile(_impl->ref_fasta)); #ifdef DEBUG_BLOCKQUANTIFY int lastpos = 0; std::cerr << "starting block." << "\n"; #endif auto current_bs_start = _impl->variants.begin(); std::string current_chr; int current_bs = -1; bool current_bs_valid = false; // function to compute the QQ values for truth variants in the current // benchmarking superlocus const auto update_bs_filters = [this, ¤t_bs_start](BlockQuantifyImpl::variantlist_t::iterator to) { std::set<int> bs_filters; for(auto cur = current_bs_start; cur != to; ++cur) { for(int nf = 0; nf < (*cur)->d.n_flt; ++nf) { const int f = (*cur)->d.flt[nf]; if(f != bcf_hdr_id2int(_impl->hdr, BCF_DT_ID, "PASS")) { bs_filters.insert(f); } } } if(bs_filters.empty()) { return; } for(auto cur = current_bs_start; cur != to; ++cur) { const std::string bdt = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 0); const std::string bvq = bcfhelpers::getFormatString(_impl->hdr, *cur, "BVT", 1); // filter TPs where the query call in NOCALL if(bdt == "TP" && bvq == "NOCALL") { for(auto f : bs_filters) { bcf_add_filter(_impl->hdr, *cur, f); } } } }; // function to compute the QQ values for truth variants in the current // benchmarking superlocus const auto update_bs_qq = [this, ¤t_bs_start](BlockQuantifyImpl::variantlist_t::iterator to) { std::vector<float> tp_qqs; for(auto cur = current_bs_start; cur != to; ++cur) { const float qqq = bcfhelpers::getFormatFloat(_impl->hdr, *cur, "QQ", 1); if(std::isnan(qqq)) { continue; } const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 1); // we want the scores of all TPs in this BS if(bd == "TP") { tp_qqs.push_back(qqq); } } float t_qq = bcfhelpers::missing_float(); if(!tp_qqs.empty()) { t_qq = *(std::min_element(tp_qqs.begin(), tp_qqs.end())); } /** compute the median over all variants */ int fsize = bcf_hdr_nsamples(_impl->hdr); float * fmt = (float*)calloc((size_t) fsize, sizeof(float)); for(auto cur = 
current_bs_start; cur != to; ++cur) { const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 0); bcf_get_format_float(_impl->hdr, *cur, "QQ", &fmt, &fsize); if(bd != "TP") { fmt[0] = bcfhelpers::missing_float(); } else { const float qqq = bcfhelpers::getFormatFloat(_impl->hdr, *cur, "QQ", 1); const std::string bd = bcfhelpers::getFormatString(_impl->hdr, *cur, "BD", 1); if(bd == "TP" && !std::isnan(qqq)) { fmt[0] = qqq; } else { fmt[0] = t_qq; } } bcf_update_format_float(_impl->hdr, *cur, "QQ", fmt, fsize); } free(fmt); #ifdef DEBUG_BLOCKQUANTIFY const int bs = bcfhelpers::getInfoInt(_impl->hdr, *current_bs_start, "BS", -1); std::string values; for(float x : tp_qqs) { values += std::to_string(x) + ","; } std::cerr << "BS: " << bs << " T_QQ = " << t_qq << " [" << values << "]" << "\n"; #endif }; const auto update_bs_conf_boundary_flag = [this, ¤t_bs_start](BlockQuantifyImpl::variantlist_t::iterator to) { static const int has_conf = 1; static const int has_non_conf = 2; int conf_non_conf = 0; for(auto cur = current_bs_start; cur != to; ++cur) { const std::string regions = bcfhelpers::getInfoString(_impl->hdr, *cur, "Regions", ""); if(regions.find("CONF") == std::string::npos) { conf_non_conf |= has_non_conf; } else { conf_non_conf |= has_conf; } if(regions.find("TS_boundary") != std::string::npos) { conf_non_conf |= has_non_conf | has_conf; } } for(auto cur = current_bs_start; cur != to; ++cur) { const std::string regions = bcfhelpers::getInfoString(_impl->hdr, *cur, "Regions", ""); if(conf_non_conf == (has_conf | has_non_conf)) { if(regions.find("TS_boundary") == std::string::npos) { bcf_update_info_string(_impl->hdr, *cur, "Regions", (regions.empty() ? "TS_boundary" : regions + ",TS_boundary").c_str()); } } else if(conf_non_conf == has_conf) { if(regions.find("TS_contained") == std::string::npos) { // also flag fully confident superloci bcf_update_info_string(_impl->hdr, *cur, "Regions", (regions.empty() ? 
"TS_contained" : regions + ",TS_contained").c_str()); } } } }; for(auto v_it = _impl->variants.begin(); v_it != _impl->variants.end(); ++v_it) { // update fields, must output GA4GH-compliant fields countVariants(*v_it); // determine benchmarking superlocus const std::string vchr = bcfhelpers::getChrom(_impl->hdr, *v_it); const int vbs = bcfhelpers::getInfoInt(_impl->hdr, *v_it, "BS"); if(!current_bs_valid) { current_bs = vbs; current_chr = vchr; current_bs_valid = true; } #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "current BS = " << current_bs << " vbs = " << vbs << "\n"; #endif if( current_bs_start != v_it && (vbs != current_bs || vbs < 0 || vchr != current_chr)) { #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "finishing BS = " << current_bs << " vbs = " << vbs << "\n"; #endif update_bs_qq(v_it); update_bs_filters(v_it); update_bs_conf_boundary_flag(v_it); current_bs = vbs; current_chr = vchr; current_bs_start = v_it; } } // do final superlocus (if any) update_bs_qq(_impl->variants.end()); update_bs_filters(_impl->variants.end()); update_bs_conf_boundary_flag(_impl->variants.end()); for(auto & v : _impl->variants) { #ifdef DEBUG_BLOCKQUANTIFY lastpos = v->pos; #endif // use BD and BVT to make ROCs rocEvaluate(v); } #ifdef DEBUG_BLOCKQUANTIFY std::cerr << "finished block " << lastpos << " - " << _impl->variants.size() << " records on thread " << std::this_thread::get_id() << "\n"; #endif _impl->fasta_to_use.reset(nullptr); }