/*
 * Prepare the query state once the readers in args->files are available.
 *
 *  - args->header is taken from the first reader.
 *  - When a sample list other than "-" was given, every reader's header is
 *    subset to it.  For a non-negated list (no leading '^') the header index
 *    of each listed sample is collected in the order the list gives them and
 *    handed to convert_init().
 *  - The converter and, if args->filter_str is set, the filter are created
 *    and their combined unpack level is stored in args->files->max_unpack.
 *
 * Failures are reported through error(); the code assumes error() does not
 * return.
 */
static void init_data(args_t *args)
{
    args->header = args->files->readers[0].header;

    int i, nsamples = 0, *samples = NULL;
    if ( args->sample_list && strcmp("-",args->sample_list) )
    {
        for (i=0; i<args->files->nreaders; i++)
        {
            int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
            if ( ret<0 ) error("Error parsing the sample list\n");
            else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
        }

        if ( args->sample_list[0]!='^' )
        {
            // the sample ordering may be different if not negated
            bcf_hdr_t *hdr0 = args->files->readers[0].header;
            int n;
            char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
            if ( !smpls ) error("Could not parse %s\n", args->sample_list);
            if ( n!=bcf_hdr_nsamples(hdr0) )
                error("The number of samples does not match, perhaps some are present multiple times?\n");

            nsamples = bcf_hdr_nsamples(hdr0);
            samples = (int*) malloc(sizeof(int)*nsamples);
            // malloc(0) may legitimately return NULL, only a non-empty request can fail
            if ( !samples && nsamples>0 ) error("Could not allocate memory for %d sample indexes\n", nsamples);

            for (i=0; i<n; i++)
            {
                samples[i] = bcf_hdr_id2int(hdr0, BCF_DT_SAMPLE, smpls[i]);
                // a negative index must never reach the converter as a sample index
                if ( samples[i]<0 ) error("No such sample in the header: %s\n", smpls[i]);
                free(smpls[i]);
            }
            free(smpls);
        }
    }

    args->convert = convert_init(args->header, samples, nsamples, args->format_str);
    if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
    free(samples);

    int max_unpack = convert_max_unpack(args->convert);
    if ( args->filter_str )
    {
        args->filter = filter_init(args->header, args->filter_str);
        max_unpack |= filter_max_unpack(args->filter);
    }
    args->files->max_unpack = max_unpack;
}
/*
 * Read FORMAT/BAF of one sample from args->fname, build one BAF histogram
 * per chromosome, run init_dist() on each histogram and create the two
 * output files in args->output_dir.
 *
 * Steps:
 *  1. Open the input through a synced reader, applying args->regions_list
 *     and args->targets_list when set.
 *  2. Select the sample: args->sample if given (it must exist in the
 *     header), otherwise the only sample of a single-sample file.
 *  3. Require that FORMAT/BAF is defined in the header.
 *  4. Fill args->xvals with nbins evenly spaced values i/(nbins-1).
 *  5. For every record with exactly one non-missing BAF value, increment the
 *     bin (int)(BAF*(nbins-1)) of the current chromosome's histogram.  A new
 *     dist_t is appended to args->dist whenever the record's rid differs
 *     from the previous one.
 *  6. Open dist.dat (kept open in args->dat_fp) and write its comment
 *     header; write dist.py and make it executable.
 *
 * Failures are reported through error(); the code assumes error() does not
 * return.
 */
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) )
        error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));

    bcf_hdr_t *hdr = files->readers[0].header;
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        // hdr->samples[0] does not exist in a sites-only file
        if ( bcf_hdr_nsamples(hdr)<1 ) error("No samples in the VCF?\n");
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 )
        error("No such sample: %s\n", args->sample);

    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    // i/(nbins-1) below would divide by zero with a single bin
    if ( args->nbins<2 ) error("Expected at least two BAF bins, got %d\n", (int)args->nbins);

    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    if ( !args->xvals ) error("Could not allocate memory for %d bins\n", (int)args->nbins);
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        // The value becomes an array index below.  Anything outside [0,1]
        // would land outside yvals; the test is written so that NaN is
        // rejected as well.
        if ( !(baf[0]>=0 && baf[0]<=1) ) continue;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            dist_t *grown = (dist_t*) realloc(args->dist, sizeof(dist_t)*(args->ndist+1));
            if ( !grown ) error("Could not allocate memory for %d distributions\n", args->ndist+1);
            args->dist = grown;
            idist = args->ndist++;

            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            if ( !args->dist[idist].chr || !args->dist[idist].yvals )
                error("Could not allocate memory for the distribution of %s\n", bcf_seqname(hdr,line));
            args->dist[idist].xvals = args->xvals;     // shared by all distributions
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
        init_dist(args, &args->dist[idist],args->verbose);

    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);

    // NOTE(review): leading whitespace inside the literals below is Python
    // indentation and therefore significant.  The literals are reproduced
    // character for character from the reviewed text - confirm them against
    // the repository copy before relying on the generated script.
    //-------- matplotlib script --------------
    fprintf(fp,
            "#!/usr/bin/env python\n"
            "#\n"
            "import matplotlib as mpl\n"
            "mpl.use('Agg')\n"
            "import matplotlib.pyplot as plt\n"
            "import csv,sys,argparse\n"
            "from math import exp\n"
            "\n"
            "outdir = '%s'\n"
            "\n"
            "def read_dat(dat,fit,cn):\n"
            " csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
            " with open(outdir+'/dist.dat', 'rb') as f:\n"
            " reader = csv.reader(f, 'tab')\n"
            " for row in reader:\n"
            " if row[0][0]=='#': continue\n"
            " type = row[0]\n"
            " chr = row[1]\n"
            " if type=='DIST':\n"
            " if chr not in dat: dat[chr] = []\n"
            " dat[chr].append(row)\n"
            " elif type=='FIT':\n"
            " if chr not in fit: fit[chr] = []\n"
            " fit[chr].append(row)\n"
            " elif type=='CN':\n"
            " cn[chr] = row[2]\n"
            "\n"
            "def plot_dist(dat,fit,chr):\n"
            " fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
            " ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
            " if chr in fit:\n"
            " for i in range(len(fit[chr])):\n"
            " pfit = fit[chr][i]\n"
            " exec('def xfit(x): return '+pfit[5])\n"
            " istart = int(pfit[3])\n"
            " iend = int(pfit[4])+1\n"
            " vals = dat[chr][istart:iend]\n"
            " args = {}\n"
            " if i==0: args = {'label':'Target to Fit'}\n"
            " ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
            " if i==0: args = {'label':'Best Fit'}\n"
            " ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
            " ax.set_title('BAF distribution, chr'+chr)\n"
            " ax.set_xlabel('BAF')\n"
            " ax.set_ylabel('Frequency')\n"
            " ax.legend(loc='best',prop={'size':7},frameon=False)\n"
            " plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
            " plt.close()\n"
            "\n"
            "def plot_copy_number(cn):\n"
            " fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
            " xlabels = sorted(cn.keys())\n"
            " xvals = range(len(xlabels))\n"
            " yvals = [float(cn[x]) for x in xlabels]\n"
            " ax.plot(xvals,yvals,'o',color='red')\n"
            " for i in range(len(xvals)):\n"
            " if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
            " ax.tick_params(axis='both', which='major', labelsize=9)\n"
            " ax.set_xticks(xvals)\n"
            " ax.set_xticklabels(xlabels,rotation=45)\n"
            " ax.set_xlim(-1,len(xlabels))\n"
            " ax.set_ylim(0,5.0)\n"
            " ax.set_yticks([1.0,2.0,3.0,4.0])\n"
            " ax.set_xlabel('Chromosome')\n"
            " ax.set_ylabel('Copy Number')\n"
            " plt.savefig(outdir+'/copy-number.png')\n"
            " plt.close()\n"
            "\n"
            "class myParser(argparse.ArgumentParser):\n"
            " def error(self, message):\n"
            " self.print_help()\n"
            " sys.stderr.write('error: %%s\\n' %% message)\n"
            " sys.exit(2)\n"
            "\n"
            "def main():\n"
            " parser = myParser()\n"
            " parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
            " parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
            " parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
            " args = parser.parse_args()\n"
            " dat = {}; fit = {}; cn = {}\n"
            " read_dat(dat,fit,cn)\n"
            " if args.distrib!=None:\n"
            " plot_dist(dat,fit,args.distrib)\n"
            " if args.all:\n"
            " for chr in dat: plot_dist(dat,fit,chr)\n"
            " plot_copy_number(cn)\n"
            " elif args.copy_number:\n"
            " plot_copy_number(cn)\n"
            " else:\n"
            " for chr in dat: plot_dist(dat,fit,chr)\n"
            "\n"
            "if __name__ == '__main__':\n"
            " main()\n",
            args->output_dir);
    //---------------------------------------

    // rwxr-xr-x, so that the generated script can be run directly
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
//{{{ void print_query_result_offset(uint32_t *mask, void print_query_result_offset(uint32_t *mask, uint32_t mask_len, uint32_t *vids, struct gqt_query *q, uint32_t **counts, uint32_t *id_lens, uint32_t *U_R, uint32_t U_R_len, char **id_query_list, char **gt_query_list, uint32_t num_qs, uint32_t num_fields, char *off_file_name, char *source_file, char *full_cmd) { struct off_file *off_f = open_off_file(off_file_name); struct bcf_file bcf_f = init_bcf_file(source_file); char *sample_names = NULL; uint32_t i,j,k,line_idx,bytes, bit_i = 0; int r; for (i = 0; i < U_R_len; ++i) { if (i == 0 ) r = asprintf(&sample_names, "%s", bcf_f.hdr->samples[U_R[i]]); else r = asprintf(&sample_names, "%s,%s", sample_names, bcf_f.hdr->samples[U_R[i]]); if (r == -1) err(EX_OSERR, "asprintf error"); } if (bcf_hdr_set_samples(bcf_f.hdr, sample_names, 0) != 0) errx(EX_DATAERR, "Error setting samples: %s\n", source_file); char *info_s; for (i = 0; i < num_qs; i++) { if ( q[i].variant_op == p_count ) { r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Integer," "Description=\"GQT count result from " "phenotype:'%s' genotype:'%s'\">", i, id_query_list[i], gt_query_list[i]); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); } else if ( q[i].variant_op == p_pct ) { r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float," "Description=\"GQT percent result from " "phenotype:'%s' genotype:'%s'\">", i, id_query_list[i], gt_query_list[i]); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); } else if ( q[i].variant_op == p_maf ) { r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float," "Description=\"GQT maf result from " "phenotype:'%s' genotype:'%s'\">", i, id_query_list[i], gt_query_list[i]); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: 
%s\n", source_file); } } r = asprintf(&info_s, "##%s_queryVersion=%s", PROGRAM_NAME, VERSION); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); r = asprintf(&info_s, "##%s_queryCommand=%s", PROGRAM_NAME, full_cmd); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); htsFile *out_f = hts_open("-","w"); if ( !out_f ) err(EX_DATAERR, "Could open output file"); bcf_hdr_write(out_f, bcf_f.hdr); bcf_f.line = bcf_init1(); for (i=0; i < mask_len; ++i) { bytes = mask[i]; if (bytes == 0) continue; /* skip a bunch of ops if you can */ for (j=0; j < 32; j++) { if (bytes & 1 << (31 - j)) { line_idx = i*32+j; r = goto_bcf_line(&bcf_f, off_f, line_idx); if (r == -1) err(EX_NOINPUT, "Error seeking file '%s'", bcf_f.file_name); r = get_bcf_line(&bcf_f); if (r == -1) err(EX_NOINPUT, "Error reading file '%s'", bcf_f.file_name); for (k=0; k < num_qs; k++) { r = asprintf(&info_s, "GQT_%u", k); if (r == -1) err(EX_OSERR, "asprintf error"); if ( q[k].variant_op == p_count ) { int32_t v = counts[k][line_idx]; if (bcf_update_info_int32(bcf_f.hdr, bcf_f.line, info_s, &v, 1) != 0) errx(EX_DATAERR, "Error adding to info field: %s\n", bcf_f.file_name); } else if (q[k].variant_op == p_pct) { float v = ((float)counts[k][line_idx])/ ((float) id_lens[k]); if (bcf_update_info_float(bcf_f.hdr, bcf_f.line, info_s, &v, 1) != 0) errx(EX_DATAERR, "Error adding to info field: %s\n", bcf_f.file_name); } else if (q[k].variant_op == p_maf) { float v = ((float)counts[k][line_idx])/ (((float) id_lens[k])*2.0); if (bcf_update_info_float(bcf_f.hdr, bcf_f.line, info_s, &v, 1) != 0) errx(EX_DATAERR, "Error adding to info field: %s\n", bcf_f.file_name); } } bcf_write(out_f, bcf_f.hdr, bcf_f.line); } bit_i++; if (bit_i == num_fields) break; } if (bit_i == num_fields) break; } hts_close(out_f); 
destroy_off_file(off_f); }
/*
 * Prepare the run state: pick the query sample, subset the header, set up
 * the two-state (HW/AZ) HMM and print the output header to stdout.
 *
 * Sample selection:
 *  - Without args->sample the file must hold exactly one sample, which is
 *    then used (the name is duplicated into args->sample).
 *  - With args->estimate_AF naming a file, the header is subset to the
 *    samples listed there plus the query sample when the list lacks it.
 *  - With args->estimate_AF unset, the header is subset to the query sample.
 *  - With args->estimate_AF equal to "-", the header is left as it is.
 *
 * The HMM gets a transition-probability callback when a genetic map or a
 * positive recombination rate was supplied; otherwise hmm_init() is called
 * with 10000 as its last argument instead of 0.
 *
 * Failures are reported through error(); the code assumes error() does not
 * return.
 */
static void init_data(args_t *args)
{
    args->prev_rid = args->skip_rid = -1;
    args->hdr = args->files->readers[0].header;

    // Has to precede the read of hdr->samples[0] just below
    if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");

    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = strdup(args->hdr->samples[0]);
    }

    // Set samples
    kstring_t str = {0,0,0};
    if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
    {
        int i, n;
        char **smpls = hts_readlist(args->estimate_AF, 1, &n);
        if ( !smpls ) error("Could not read the list of samples: %s\n", args->estimate_AF);

        // Make sure the query sample is included
        for (i=0; i<n; i++)
            if ( !strcmp(args->sample,smpls[i]) ) break;

        // Add the query sample if not present: the loop above runs to the
        // end (i==n) only when no entry matched
        if ( i==n ) kputs(args->sample, &str);

        for (i=0; i<n; i++)
        {
            if ( str.l ) kputc(',', &str);
            kputs(smpls[i], &str);
            free(smpls[i]);
        }
        free(smpls);
    }
    else if ( !args->estimate_AF )
        kputs(args->sample, &str);

    if ( str.l )
    {
        int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
        if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
        else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
    }

    if ( args->af_tag )
        if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
            error("No such INFO tag in the VCF: %s\n", args->af_tag);

    args->nsmpl = bcf_hdr_nsamples(args->hdr);
    args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
    // Reachable when the header was not subset (estimate_AF is "-") and the
    // requested sample does not exist
    if ( args->ismpl<0 ) error("No such sample: %s\n", args->sample);
    free(str.s);

    // Lookup table, phred-scaled value -> probability
    int i;
    for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);

    // Init transition matrix and HMM
    double tprob[4];
    MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
    MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
    MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
    MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;

    if ( args->genmap_fname )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
    }
    else if ( args->rec_rate > 0 )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
    }
    else
        args->hmm = hmm_init(2, tprob, 10000);

    // print header
    printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
    printf("# The command line was:\tbcftools %s", args->argv[0]);
    for (i=1; i<args->argc; i++)
        printf(" %s",args->argv[i]);
    printf("\n#\n");
    printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
}
/*
 * Plugin entry point: parse the options, subset the input header to the
 * requested samples and set up the HMM for the chosen mode.
 *
 * Options:
 *   -p, --prefix     required, stored in args.prefix
 *   -t, --trio       three comma-separated sample names; they are stored as
 *                    mother, father, child in the order given
 *   -u, --unrelated  two comma-separated sample names
 * Exactly one of -t/-u must be given.
 *
 * The file-scope `args` is reset first, then prev_rid is set to -1, hdr to
 * `in`, pij to 2e-8 and pgt_err to 1e-9.  `out` is not used.
 *
 * Returns 1.  Usage and parse errors are reported through error(); the code
 * assumes error() does not return.
 */
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    char *trio_samples = NULL, *unrelated_samples = NULL;
    memset(&args,0,sizeof(args_t));
    args.prev_rid = -1;
    args.hdr = in;
    args.pij = 2e-8;
    args.pgt_err = 1e-9;

    static struct option loptions[] =
    {
        {"prefix",1,0,'p'},
        {"trio",1,0,'t'},
        {"unrelated",1,0,'u'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:u:p:",loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 'p': args.prefix = optarg; break;
            case 't': trio_samples = optarg; break;
            case 'u': unrelated_samples = optarg; break;
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    // The usage text is data, never a format string
    if ( optind != argc ) error("%s", usage());
    if ( trio_samples && unrelated_samples ) error("Expected only one of the -t/-u options\n");
    if ( !trio_samples && !unrelated_samples ) error("Expected one of the -t/-u options\n");
    if ( !args.prefix ) error("Expected the -p option\n");

    // Exactly one of the two is set at this point
    char *sample_list = trio_samples ? trio_samples : unrelated_samples;

    int ret = bcf_hdr_set_samples(args.hdr, sample_list, 0);
    if ( ret<0 ) error("Could not parse samples: %s\n", sample_list);
    else if ( ret>0 ) error("%d-th sample not found: %s\n", ret,sample_list);

    if ( trio_samples )
    {
        int i,n = 0;
        char **list = hts_readlist(trio_samples, 0, &n);
        if ( !list || n!=3 ) error("Expected three sample names with -t\n");
        args.imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        args.ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_trio;
        args.mode = C_TRIO;
        init_hmm_trio(&args);
    }
    else
    {
        int i,n = 0;
        char **list = hts_readlist(unrelated_samples, 0, &n);
        if ( !list || n!=2 ) error("Expected two sample names with -u\n");
        args.isample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.jsample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_unrelated;
        args.mode = C_UNRL;
        init_hmm_unrelated(&args);
    }
    return 1;
}