int main_vcfisec(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; args->output_fname = NULL; args->output_type = FT_VCF; args->n_threads = 0; args->record_cmd_line = 1; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = { {"help",no_argument,NULL,'h'}, {"exclude",required_argument,NULL,'e'}, {"include",required_argument,NULL,'i'}, {"collapse",required_argument,NULL,'c'}, {"complement",no_argument,NULL,'C'}, {"apply-filters",required_argument,NULL,'f'}, {"nfiles",required_argument,NULL,'n'}, {"prefix",required_argument,NULL,'p'}, {"write",required_argument,NULL,'w'}, {"targets",required_argument,NULL,'t'}, {"targets-file",required_argument,NULL,'T'}, {"regions",required_argument,NULL,'r'}, {"regions-file",required_argument,NULL,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {"threads",required_argument,NULL,9}, {"no-version",no_argument,NULL,8}, {NULL,0,NULL,0} }; while ((c = getopt_long(argc, argv, "hc:r:R:p:n:w:t:T:Cf:o:O:i:e:",loptions,NULL)) >= 0) { switch (c) { case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'c': if ( !strcmp(optarg,"snps") ) args->files->collapse |= COLLAPSE_SNPS; else if ( !strcmp(optarg,"indels") ) args->files->collapse |= COLLAPSE_INDELS; else if ( !strcmp(optarg,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(optarg,"any") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME; else if ( !strcmp(optarg,"none") ) 
args->files->collapse = COLLAPSE_NONE; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'f': args->files->apply_filters = optarg; break; case 'C': args->isec_op = OP_COMPLEMENT; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'p': args->prefix = optarg; break; case 'w': args->write_files = optarg; break; case 'i': add_filter(args, optarg, FLT_INCLUDE); break; case 'e': add_filter(args, optarg, FLT_EXCLUDE); break; case 'n': { char *p = optarg; if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; } else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; } else if ( *p=='=' ) { args->isec_op = OP_EQUAL; p++; } else if ( *p=='~' ) { args->isec_op = OP_EXACT; p++; } else if ( isdigit(*p) ) args->isec_op = OP_EQUAL; else error("Could not parse --nfiles %s\n", optarg); if ( args->isec_op == OP_EXACT ) args->isec_exact = p; else if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg); } break; case 9 : args->n_threads = strtol(optarg, 0, 0); break; case 8 : args->record_cmd_line = 0; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } if ( argc-optind<1 ) usage(); // no file given if ( args->targets_list && bcf_sr_set_targets(args->files, args->targets_list, targets_is_file,0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( argc-optind==2 && !args->isec_op ) { args->isec_op = OP_VENN; if ( !args->prefix ) error("Expected the -p option\n"); } if ( !args->targets_list ) { if ( argc-optind<2 ) error("Expected multiple files or the --targets option\n"); if ( !args->isec_op ) error("Expected two file names or one of 
the options --complement, --nfiles or --targets\n"); } args->files->require_index = 1; while (optind<argc) { if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); optind++; } init_data(args); isec_vcf(args); destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
/*
 *  Prepares everything needed before the files in args->fnames are
 *  concatenated:
 *    - reads the header of every input file, merges it into args->out_hdr
 *      and checks that sample count and sample names agree;
 *    - with args->phased_concat, records in args->start_pos[] the position
 *      of each file's first record (-1 when the file starts on a different
 *      chromosome than the previous one, -2 when no record could be read);
 *    - opens args->out_fh and writes the merged header;
 *    - with args->allow_overlaps, sets up the synced reader with all files;
 *      otherwise, with args->phased_concat, drops the files marked -2,
 *      verifies the order of start positions and allocates the per-sample
 *      work arrays.
 *
 *  Failures are reported through error() (defined elsewhere).
 */
static void init_data(args_t *args)
{
    bcf1_t *line = NULL;

    // With phased concat, the chunks overlap and come in the right order. To
    // avoid opening all files at once, store start positions to recognise need
    // for the next one. This way we can keep only two open chunks at once.
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        line = bcf_init();
    }

    int i, prev_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r");
        if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
        args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr);
        if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
            error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        int j;
        for (j=0; j<bcf_hdr_nsamples(hdr); j++)
            if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) )
                error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

        if ( args->phased_concat )
        {
            int ret = bcf_read(fp, hdr, line);
            if ( ret!=0 ) args->start_pos[i] = -2;  // empty file
            else
            {
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line));
                args->start_pos[i] = chrid==prev_chrid ? line->pos : -1;
                prev_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        hts_close(fp);
    }
    if ( line ) bcf_destroy(line);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");
    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads);

    // A failed header write would otherwise go unnoticed and leave a broken output file
    if ( bcf_hdr_write(args->out_fh, args->out_hdr)!=0 )
        error("Failed to write the header to \"%s\"\n", args->output_fname);

    if ( args->allow_overlaps )
    {
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        if ( args->remove_dups )
        {
            if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS;
            else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
            else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY;
            else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE;
            else error("The -D string \"%s\" not recognised.\n", args->remove_dups);
        }
        for (i=0; i<args->nfnames; i++)
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) )
                error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
    }
    else if ( args->phased_concat )
    {
        // Remove empty files from the list: repeatedly swap the first empty
        // slot with the next non-empty one, so that the non-empty files keep
        // their relative order and end up in front.
        int nok = 0;
        while (1)
        {
            while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++;
            if ( nok==args->nfnames ) break;

            i = nok;
            while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
            if ( i==args->nfnames ) break;

            int tmp = args->start_pos[nok];
            args->start_pos[nok] = args->start_pos[i];
            args->start_pos[i]   = tmp;

            char *swap_name  = args->fnames[nok];
            args->fnames[nok] = args->fnames[i];
            args->fnames[i]   = swap_name;
        }
        for (i=nok; i<args->nfnames; i++) free(args->fnames[i]);
        args->nfnames = nok;

        // Neighbouring files that continue on the same chromosome must be in ascending order
        for (i=1; i<args->nfnames; i++)
            if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] )
                error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);

        args->prev_chr   = -1;
        args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmatch     = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->nmism      = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
        args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->phase_set  = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        args->ifname = 0;
    }
}
/*
 *  Reads args->fname, collects one histogram of FORMAT/BAF values per
 *  chromosome for the selected sample and writes the headers of the two
 *  output files:
 *    - <output_dir>/dist.dat, kept open in args->dat_fp;
 *    - <output_dir>/dist.py, a plotting script, written, made executable
 *      and closed here.
 *
 *  The histograms have args->nbins bins and are stored in args->dist[]
 *  (args->ndist entries); every entry shares the x-axis args->xvals.
 *  init_dist() is called on each histogram once the file has been read.
 *
 *  Failures are reported through error() (defined elsewhere).
 */
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) )
        error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;

    // Without an explicit sample the file must not have more than one
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    // x-axis shared by all distributions: nbins values spread evenly over [0,1]
    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, nprocessed = 0, ntotal = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        ntotal++;

        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        // The bin index below is only valid for values in [0,1]; anything
        // else (including NaN, hence the negated test) would index outside
        // the nbins-sized yvals array.
        if ( !(baf[0]>=0 && baf[0]<=1) ) continue;

        nprocessed++;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    // Header of the data file: provenance, command line and column descriptions
    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    // NOTE(review): the script text below is reproduced exactly as found in
    // this file; the leading whitespace inside the literals looks collapsed
    // and should be confirmed against a known-good copy before relying on
    // the generated dist.py.
    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
//-------- matplotlib script --------------
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        " csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        " with open(outdir+'/dist.dat', 'rb') as f:\n"
        " reader = csv.reader(f, 'tab')\n"
        " for row in reader:\n"
        " if row[0][0]=='#': continue\n"
        " type = row[0]\n"
        " chr = row[1]\n"
        " if type=='DIST':\n"
        " if chr not in dat: dat[chr] = []\n"
        " dat[chr].append(row)\n"
        " elif type=='FIT':\n"
        " if chr not in fit: fit[chr] = []\n"
        " fit[chr].append(row)\n"
        " elif type=='CN':\n"
        " cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        " fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        " ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        " if chr in fit:\n"
        " for i in range(len(fit[chr])):\n"
        " pfit = fit[chr][i]\n"
        " exec('def xfit(x): return '+pfit[5])\n"
        " istart = int(pfit[3])\n"
        " iend = int(pfit[4])+1\n"
        " vals = dat[chr][istart:iend]\n"
        " args = {}\n"
        " if i==0: args = {'label':'Target to Fit'}\n"
        " ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        " if i==0: args = {'label':'Best Fit'}\n"
        " ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        " ax.set_title('BAF distribution, chr'+chr)\n"
        " ax.set_xlabel('BAF')\n"
        " ax.set_ylabel('Frequency')\n"
        " ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        " plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        " plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        " fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        " xlabels = sorted(cn.keys())\n"
        " xvals = range(len(xlabels))\n"
        " yvals = [float(cn[x]) for x in xlabels]\n"
        " ax.plot(xvals,yvals,'o',color='red')\n"
        " for i in range(len(xvals)):\n"
        " if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        " ax.tick_params(axis='both', which='major', labelsize=9)\n"
        " ax.set_xticks(xvals)\n"
        " ax.set_xticklabels(xlabels,rotation=45)\n"
        " ax.set_xlim(-1,len(xlabels))\n"
        " ax.set_ylim(0,5.0)\n"
        " ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        " ax.set_xlabel('Chromosome')\n"
        " ax.set_ylabel('Copy Number')\n"
        " plt.savefig(outdir+'/copy-number.png')\n"
        " plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        " def error(self, message):\n"
        " self.print_help()\n"
        " sys.stderr.write('error: %%s\\n' %% message)\n"
        " sys.exit(2)\n"
        "\n"
        "def main():\n"
        " parser = myParser()\n"
        " parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        " parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        " parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        " args = parser.parse_args()\n"
        " dat = {}; fit = {}; cn = {}\n"
        " read_dat(dat,fit,cn)\n"
        " if args.distrib!=None:\n"
        " plot_dist(dat,fit,args.distrib)\n"
        " if args.all:\n"
        " for chr in dat: plot_dist(dat,fit,chr)\n"
        " plot_copy_number(cn)\n"
        " elif args.copy_number:\n"
        " plot_copy_number(cn)\n"
        " else:\n"
        " for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        " main()\n",
        args->output_dir);
//---------------------------------------
    // rwx for the owner, r-x for group and others
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
/*
 *  Opens the input file through a synced reader, creates one output file
 *  per sample in args->output_dir and parses the list of tags to keep.
 *
 *  Output files: for every sample i with a non-NULL args->bnames[i] the
 *  path is <output_dir>/<bnames[i]><suffix>, with whitespace in the base
 *  name replaced by '_' and the suffix chosen from args->output_type. The
 *  handle is stored in args->fh[i] and a single-sample header is written.
 *
 *  Tags: args->keep_tags is a comma-separated list. "INFO" and "FMT" /
 *  "FORMAT" on their own switch on args->keep_info / args->keep_fmt; an
 *  "INFO/", "FMT/" or "FORMAT/" prefix selects which of the arrays
 *  args->info_tags / args->fmt_tags the following tag names are flagged
 *  in (indexed by header id). If nothing is selected, both keep_info and
 *  keep_fmt are switched on.
 *
 *  Failures are reported through error() (defined elsewhere).
 */
static void init_data(args_t *args)
{
    args->sr = bcf_sr_init();
    if ( args->region )
    {
        args->sr->require_index = 1;
        if ( bcf_sr_set_regions(args->sr, args->region, args->region_is_file)<0 ) error("Failed to read the regions: %s\n",args->region);
    }
    if ( args->target && bcf_sr_set_targets(args->sr, args->target, args->target_is_file, 0)<0 ) error("Failed to read the targets: %s\n",args->target);
    if ( !bcf_sr_add_reader(args->sr,args->fname) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum));
    args->hdr_in  = bcf_sr_get_header(args->sr,0);
    args->hdr_out = bcf_hdr_dup(args->hdr_in);

    if ( args->filter_str )
        args->filter = filter_init(args->hdr_in, args->filter_str);

    mkdir_p("%s/",args->output_dir);

    int i, nsmpl = bcf_hdr_nsamples(args->hdr_in);
    if ( !nsmpl ) error("No samples to split: %s\n", args->fname);
    args->fh = (htsFile**)calloc(nsmpl,sizeof(*args->fh));
    args->bnames = set_file_base_names(args);
    kstring_t str = {0,0,0};
    for (i=0; i<nsmpl; i++)
    {
        if ( !args->bnames[i] ) continue;   // no base name, no output file for this sample
        str.l = 0;
        kputs(args->output_dir, &str);
        if ( str.s[str.l-1] != '/' ) kputc('/', &str);
        int k, l = str.l;
        kputs(args->bnames[i], &str);
        // sanitise only the base name, not the directory part; the cast keeps
        // isspace() defined for bytes with the high bit set
        for (k=l; k<str.l; k++) if ( isspace((unsigned char)str.s[k]) ) str.s[k] = '_';
        if ( args->output_type & FT_BCF ) kputs(".bcf", &str);
        else if ( args->output_type & FT_GZ ) kputs(".vcf.gz", &str);
        else kputs(".vcf", &str);
        args->fh[i] = hts_open(str.s, hts_bcf_wmode(args->output_type));
        if ( args->fh[i] == NULL ) error("Can't write to \"%s\": %s\n", str.s, strerror(errno));
        // The shared output header is narrowed to one sample slot carrying the file's base name
        bcf_hdr_nsamples(args->hdr_out) = 1;
        args->hdr_out->samples[0] = args->bnames[i];
        if ( bcf_hdr_write(args->fh[i], args->hdr_out)!=0 )
            error("Failed to write the header to \"%s\"\n", str.s);
    }
    free(str.s);

    // parse tags
    int is_info = 0, is_fmt = 0;
    char *beg = args->keep_tags;
    while ( beg && *beg )
    {
        if ( !strncasecmp("INFO/",beg,5) ) { is_info = 1; is_fmt = 0; beg += 5; }
        else if ( !strcasecmp("INFO",beg) ) { args->keep_info = 1; break; }
        else if ( !strncasecmp("INFO,",beg,5) ) { args->keep_info = 1; beg += 5; continue; }
        else if ( !strncasecmp("FMT/",beg,4) ) { is_info = 0; is_fmt = 1; beg += 4; }
        else if ( !strncasecmp("FORMAT/",beg,7) ) { is_info = 0; is_fmt = 1; beg += 7; }
        else if ( !strcasecmp("FMT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strcasecmp("FORMAT",beg) ) { args->keep_fmt = 1; break; }
        else if ( !strncasecmp("FMT,",beg,4) ) { args->keep_fmt = 1; beg += 4; continue; }
        else if ( !strncasecmp("FORMAT,",beg,7) ) { args->keep_fmt = 1; beg += 7; continue; }

        // Cut out the tag name up to the next comma (the list is modified
        // in place) and look up its id in the input header
        char *end = beg;
        while ( *end && *end!=',' ) end++;
        char tmp = *end; *end = 0;
        int id = bcf_hdr_id2int(args->hdr_in, BCF_DT_ID, beg);
        beg = tmp ? end + 1 : end;
        if ( is_info && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_INFO,id) )
        {
            if ( id >= args->ninfo_tags ) args->ninfo_tags = id + 1;
            hts_expand0(uint8_t, args->ninfo_tags, args->minfo_tags, args->info_tags);
            args->info_tags[id] = 1;
        }
        if ( is_fmt && bcf_hdr_idinfo_exists(args->hdr_in,BCF_HL_FMT,id) )
        {
            if ( id >= args->nfmt_tags ) args->nfmt_tags = id + 1;
            hts_expand0(uint8_t, args->nfmt_tags, args->mfmt_tags, args->fmt_tags);
            args->fmt_tags[id] = 1;
        }
    }
    // Nothing selected: keep everything
    if ( !args->keep_info && !args->keep_fmt && !args->ninfo_tags && !args->nfmt_tags )
    {
        args->keep_info = args->keep_fmt = 1;
    }
}
/*
 *  Writes the records of all input files to args->out_fh. One of three
 *  modes is used:
 *    1. args->phased_concat: overlapping chunks are read through the synced
 *       reader with at most two files open at a time; records are handed
 *       to phased_push() and phased_flush().
 *    2. args->files is set (synced reader prepared by the caller): records
 *       from all readers are written in the order the reader returns them;
 *       with args->remove_dups only the first reader's record per position
 *       is written.
 *    3. otherwise: the files are copied one after another, checking that
 *       each chromosome forms one contiguous block and that positions
 *       within a block do not decrease.
 *
 *  Failures are reported through error() (defined elsewhere).
 */
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) )
                    error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                // Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) )
                        error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                bcf_write1(args->out_fh, args->out_hdr, line);
                if ( args->remove_dups ) break;
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        // prev_pos holds the position of the previous record on the current
        // chromosome, -1 at the start of every chromosome block
        int prev_chr_id = -1, prev_pos = -1;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r");
            if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp);
            if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    // the first tab-delimited field is the sequence name
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    // the second field is the 1-based position
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;
                    prev_pos    = pos;  // without this the sortedness check above can never fire

                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %lu bytes\n", (unsigned long) fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;
                    prev_pos    = line->pos;    // without this the sortedness check above can never fire

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
int main_vcfgtcheck(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; set_cwd(args); char *regions = NULL, *targets = NULL; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"GTs-only",1,0,'G'}, {"all-sites",0,0,'a'}, {"homs-only",0,0,'H'}, {"help",0,0,'h'}, {"genotypes",1,0,'g'}, {"plot",1,0,'p'}, {"target-sample",1,0,'S'}, {"query-sample",1,0,'s'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {0,0,0,0} }; char *tmp; while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) { switch (c) { case 'G': args->no_PLs = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg); break; case 'a': args->all_sites = 1; break; case 'H': args->hom_only = 1; break; case 'g': args->gt_fname = optarg; break; case 'p': args->plot = optarg; break; case 'S': args->target_sample = optarg; break; case 's': args->query_sample = optarg; break; case 'r': regions = optarg; break; case 'R': regions = optarg; regions_is_file = 1; break; case 't': targets = optarg; break; case 'T': targets = optarg; targets_is_file = 1; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } char *fname = NULL; if ( optind==argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(); // no files given } else fname = argv[optind]; if ( argc>optind+1 ) usage(); // too many files given if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode else args->files->require_index = 1; if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); if ( 
!bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum)); args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; if ( args->plot ) args->plot = init_prefix(args->plot); init_data(args); if ( args->cross_check ) cross_check_gts(args); else check_gt(args); destroy_data(args); bcf_sr_destroy(args->files); if (args->plot) free(args->plot); free(args); return 0; }
/*
 *  Opens the indexed input file, resolves the sample to work with and
 *  opens the optional mask, chain and output files.
 *
 *  args->isample ends up as the index of args->sample in the header, as 0
 *  when --haplotype is used with a single-sample file, or as -1 otherwise.
 *  args->fp_out is stdout unless args->output_fname is set.
 */
static void init_data(args_t *args)
{
    // Input: a single reader which must be indexed
    args->files = bcf_sr_init();
    args->files->require_index = 1;
    if ( !bcf_sr_add_reader(args->files,args->fname) )
        error("Failed to open %s: %s\n", args->fname, bcf_sr_strerror(args->files->errnum));
    args->hdr = args->files->readers[0].header;

    // Sample selection
    args->isample = -1;
    if ( args->sample )
    {
        args->isample = bcf_hdr_id2int(args->hdr,BCF_DT_SAMPLE,args->sample);
        if ( args->isample<0 )
            error("No such sample: %s\n", args->sample);
    }
    if ( args->isample<0 && args->haplotype )
    {
        if ( bcf_hdr_nsamples(args->hdr) > 1 )
            error("The --sample option is expected with --haplotype\n");
        args->isample = 0;
    }

    // Optional mask regions
    if ( args->mask_fname )
    {
        args->mask = regidx_init(args->mask_fname,NULL,NULL,0,NULL);
        if ( !args->mask )
            error("Failed to initialize mask regions\n");
    }

    // Optional chain file
    if ( args->chain_fname )
    {
        args->fp_chain = fopen(args->chain_fname,"w");
        if ( !args->fp_chain )
            error("Failed to create %s: %s\n", args->chain_fname, strerror(errno));
        args->chain_id = 0;
    }

    // Record buffer: one slot per ring-buffer element
    rbuf_init(&args->vcf_rbuf, 100);
    args->vcf_buf = (bcf1_t**) calloc(args->vcf_rbuf.m, sizeof(bcf1_t*));

    // Output stream
    if ( !args->output_fname )
        args->fp_out = stdout;
    else
    {
        args->fp_out = fopen(args->output_fname,"w");
        if ( !args->fp_out )
            error("Failed to create %s: %s\n", args->output_fname, strerror(errno));
    }
}
int main_plugin(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->output_fname = "-"; args->output_type = FT_VCF; args->nplugin_paths = -1; int regions_is_file = 0, targets_is_file = 0, plist_only = 0, usage_only = 0, version_only = 0; if ( argc==1 ) usage(args); char *plugin_name = NULL; if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; } static struct option loptions[] = { {"version",0,0,'V'}, {"verbose",0,0,'v'}, {"help",0,0,'h'}, {"list-plugins",0,0,'l'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "h?o:O:r:R:t:T:li:e:vV",loptions,NULL)) >= 0) { switch (c) { case 'V': version_only = 1; break; case 'v': args->verbose = 1; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': plist_only = 1; break; case '?': case 'h': usage_only = 1; break; default: error("Unknown argument: %s\n", optarg); } } if ( plist_only ) return list_plugins(args); if ( usage_only && ! 
plugin_name ) usage(args); load_plugin(args, plugin_name, 1, &args->plugin); if ( version_only ) { const char *bver, *hver; args->plugin.version(&bver, &hver); printf("bcftools %s using htslib %s\n", bcftools_version(), hts_version()); printf("plugin at %s using htslib %s\n\n", bver, hver); return 0; } if ( usage_only ) { if ( args->plugin.usage ) fprintf(stderr,"%s",args->plugin.usage()); else fprintf(stderr,"Usage: bcftools +%s [General Options] -- [Plugin Options]\n",plugin_name); return 0; } if ( args->plugin.run ) { int iopt = optind; optind = 0; int ret = args->plugin.run(argc-iopt, argv+iopt); destroy_data(args); free(args); return ret; } char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); args->plugin.argc = argc - optind + 1; args->plugin.argv = argv + optind - 1; } else { fname = argv[optind]; args->plugin.argc = argc - optind; args->plugin.argv = argv + optind; } optind = 0; args->files = bcf_sr_init(); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); args->files->collapse |= COLLAPSE_SOME; } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); if ( args->filter ) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; if ( !pass ) continue; } line = args->plugin.process(line); if ( line ) bcf_write1(args->out_fh, args->hdr_out, line); } destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
int run(int argc, char **argv) { char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL; memset(&args,0,sizeof(args_t)); args.mode = 0; args.output_fname = "-"; static struct option loptions[] = { {"trio",1,0,'t'}, {"trio-file",1,0,'T'}, {"delete",0,0,'d'}, {"list",1,0,'l'}, {"count",0,0,'c'}, {"rules",1,0,'r'}, {"rules-file",1,0,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0) { switch (c) { case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'R': rules_fname = optarg; break; case 'r': rules_string = optarg; break; case 'd': args.mode |= MODE_DELETE; break; case 'c': args.mode |= MODE_COUNT; break; case 'l': if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; else error("The argument not recognised: --list %s\n", optarg); break; case 't': trio_samples = optarg; break; case 'T': trio_file = optarg; break; case 'h': case '?': default: error("%s",usage()); break; } } if ( rules_fname ) args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); else args.rules = init_rules(&args, rules_string); if ( !args.rules ) return -1; args.itr = regitr_init(args.rules); args.itr_ori = regitr_init(args.rules); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else error("%s",usage()); } else fname = argv[optind]; if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n"); if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); if 
( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; args.sr = bcf_sr_init(); if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); args.hdr = bcf_sr_get_header(args.sr, 0); args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); bcf_hdr_write(args.out_fh, args.hdr); int i, n = 0; char **list; if ( trio_samples ) { args.ntrios = 1; args.trios = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(trio_samples, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); for (i=0; i<n; i++) free(list[i]); free(list); } if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; i<n; i++) { char *ss = list[i], *se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s: %s\n",trio_file, ss); *se = 0; args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s\n",trio_file); *se = 0; args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; if ( *ss=='\0' ) error("Could not parse %s\n",trio_file); args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss); free(list[i]); } free(list); } while ( bcf_sr_next_line(args.sr) ) { bcf1_t *line = bcf_sr_get_line(args.sr,0); line = process(line); if ( line ) { if ( 
line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); bcf_write1(args.out_fh, args.hdr, line); } } fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); for (i=0; i<args.ntrios; i++) { trio_t *trio = &args.trios[i]; fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ); } free(args.gt_arr); free(args.trios); regitr_destroy(args.itr); regitr_destroy(args.itr_ori); regidx_destroy(args.rules); bcf_sr_destroy(args.sr); if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); return 0; }
int main_vcfview(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->clevel = -1; args->print_header = 1; args->update_info = 1; args->output_type = FT_VCF; int targets_is_file = 0, regions_is_file = 0; static struct option loptions[] = { {"genotype",1,0,'g'}, {"compression-level",1,0,'l'}, {"header-only",0,0,'h'}, {"no-header",0,0,'H'}, {"exclude",1,0,'e'}, {"include",1,0,'i'}, {"trim-alt-alleles",0,0,'a'}, {"no-update",0,0,'I'}, {"drop-genotypes",0,0,'G'}, {"private",0,0,'x'}, {"exclude-private",0,0,'X'}, {"uncalled",0,0,'u'}, {"exclude-uncalled",0,0,'U'}, {"apply-filters",1,0,'f'}, {"known",0,0,'k'}, {"novel",0,0,'n'}, {"min-alleles",1,0,'m'}, {"max-alleles",1,0,'M'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"force-samples",0,0,1}, {"output-type",1,0,'O'}, {"output-file",1,0,'o'}, {"types",1,0,'v'}, {"exclude-types",1,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"min-ac",1,0,'c'}, {"max-ac",1,0,'C'}, {"min-af",1,0,'q'}, {"max-af",1,0,'Q'}, {"phased",0,0,'p'}, {"exclude-phased",0,0,'P'}, {0,0,0,0} }; char *tmp; while ((c = getopt_long(argc, argv, "l:t:T:r:R:o:O:s:S:Gf:knv:V:m:M:auUhHc:C:Ii:e:xXpPq:Q:g:",loptions,NULL)) >= 0) { char allele_type[8] = "nref"; switch (c) { case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'l': args->clevel = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --compression-level %s\n", optarg); args->output_type |= FT_GZ; break; case 'o': args->fn_out = optarg; break; case 'H': args->print_header = 0; break; case 'h': args->header_only = 1; break; case 't': args->targets_list = optarg; break; case 'T': 
args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 's': args->sample_names = optarg; break; case 'S': args->sample_names = optarg; args->sample_is_file = 1; break; case 1 : args->force_samples = 1; break; case 'a': args->trim_alts = 1; args->calc_ac = 1; break; case 'I': args->update_info = 0; break; case 'G': args->sites_only = 1; break; case 'f': args->files->apply_filters = optarg; break; case 'k': args->known = 1; break; case 'n': args->novel = 1; break; case 'm': args->min_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --min-alleles %s\n", optarg); break; case 'M': args->max_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --max-alleles %s\n", optarg); break; case 'v': args->include_types = optarg; break; case 'V': args->exclude_types = optarg; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'c': { args->min_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%s",&args->min_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->min_ac)!=1 ) error("Error: Could not parse --min-ac %s\n", optarg); set_allele_type(&args->min_ac_type, allele_type); args->calc_ac = 1; break; } case 'C': { args->max_ac_type = ALLELE_NONREF; if ( sscanf(optarg,"%d:%s",&args->max_ac, allele_type)!=2 && sscanf(optarg,"%d",&args->max_ac)!=1 ) error("Error: Could not parse --max-ac %s\n", optarg); set_allele_type(&args->max_ac_type, allele_type); args->calc_ac = 1; break; } case 'q': { args->min_af_type = ALLELE_NONREF; if ( sscanf(optarg,"%f:%s",&args->min_af, allele_type)!=2 && sscanf(optarg,"%f",&args->min_af)!=1 ) error("Error: Could not parse --min_af %s\n", optarg); set_allele_type(&args->min_af_type, allele_type); args->calc_ac = 1; break; } case 'Q': { args->max_af_type = ALLELE_NONREF; 
if ( sscanf(optarg,"%f:%s",&args->max_af, allele_type)!=2 && sscanf(optarg,"%f",&args->max_af)!=1 ) error("Error: Could not parse --min_af %s\n", optarg); set_allele_type(&args->max_af_type, allele_type); args->calc_ac = 1; break; } case 'x': args->private_vars |= FLT_INCLUDE; args->calc_ac = 1; break; case 'X': args->private_vars |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'u': args->uncalled |= FLT_INCLUDE; args->calc_ac = 1; break; case 'U': args->uncalled |= FLT_EXCLUDE; args->calc_ac = 1; break; case 'p': args->phased |= FLT_INCLUDE; break; // phased case 'P': args->phased |= FLT_EXCLUDE; break; // exclude-phased case 'g': { if ( !strcasecmp(optarg,"hom") ) args->gt_type = GT_NEED_HOM; else if ( !strcasecmp(optarg,"het") ) args->gt_type = GT_NEED_HET; else if ( !strcasecmp(optarg,"miss") ) args->gt_type = GT_NEED_MISSING; else if ( !strcasecmp(optarg,"^hom") ) args->gt_type = GT_NO_HOM; else if ( !strcasecmp(optarg,"^het") ) args->gt_type = GT_NO_HET; else if ( !strcasecmp(optarg,"^miss") ) args->gt_type = GT_NO_MISSING; else error("The argument to -g not recognised. 
Expected one of hom/het/miss/^hom/^het/^miss, got \"%s\".\n", optarg); break; } case '?': usage(args); default: error("Unknown argument: %s\n", optarg); } } if ( args->filter_logic == (FLT_EXCLUDE|FLT_INCLUDE) ) error("Only one of -i or -e can be given.\n"); if ( args->private_vars > FLT_EXCLUDE ) error("Only one of -x or -X can be given.\n"); if ( args->uncalled > FLT_EXCLUDE ) error("Only one of -u or -U can be given.\n"); if ( args->phased > FLT_EXCLUDE ) error("Only one of -p or -P can be given.\n"); if ( args->sample_names && args->update_info) args->calc_ac = 1; char *fname = NULL; if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); } else fname = argv[optind]; // read in the regions from the command line if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } else if ( optind+1 < argc ) { int i; kstring_t tmp = {0,0,0}; kputs(argv[optind+1],&tmp); for (i=optind+2; i<argc; i++) { kputc(',',&tmp); kputs(argv[i],&tmp); } if ( bcf_sr_set_regions(args->files, tmp.s, 0)<0 ) error("Failed to read the regions: %s\n", tmp.s); free(tmp.s); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); init_data(args); bcf_hdr_t *out_hdr = args->hnull ? args->hnull : (args->hsub ? 
args->hsub : args->hdr); if (args->print_header) bcf_hdr_write(args->out, out_hdr); else if ( args->output_type & FT_BCF ) error("BCF output requires header, cannot proceed with -H\n"); if (!args->header_only) { while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = args->files->readers[0].buffer[0]; if ( line->errcode && out_hdr!=args->hdr ) error("Undefined tags in the header, cannot proceed in the sample subset mode.\n"); if ( subset_vcf(args, line) ) bcf_write1(args->out, out_hdr, line); } } hts_close(args->out); destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
int main_vcfroh(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->t2AZ = 6.7e-8; args->t2HW = 5e-9; args->rec_rate = 0; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"AF-tag",1,0,0}, {"AF-file",1,0,1}, {"AF-dflt",1,0,2}, {"estimate-AF",1,0,'e'}, {"GTs-only",1,0,'G'}, {"sample",1,0,'s'}, {"hw-to-az",1,0,'a'}, {"az-to-hw",1,0,'H'}, {"viterbi-training",0,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"genetic-map",1,0,'m'}, {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, {0,0,0,0} }; int naf_opts = 0; char *tmp; while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) { switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; case 2: args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; case 'e': args->estimate_AF = optarg; naf_opts++; break; case 'I': args->snps_only = 1; break; case 'G': args->fake_PLs = 1; args->unseen_PL = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -G %s\n", optarg); args->unseen_PL = pow(10,-args->unseen_PL/10.); break; case 'm': args->genmap_fname = optarg; break; case 'M': args->rec_rate = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -M %s\n", optarg); break; case 's': args->sample = strdup(optarg); break; case 'a': args->t2AZ = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -a %s\n", optarg); break; case 'H': args->t2HW = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -H %s\n", optarg); break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'V': args->vi_training = 1; break; case 'h': 
case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } } if ( argc<optind+1 ) usage(args); if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ); if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW); if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n"); if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n"); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( args->af_fname ) { if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) error("Failed to read the targets: %s\n", args->af_fname); } if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); init_data(args); while ( bcf_sr_next_line(args->files) ) { vcfroh(args, args->files->readers[0].buffer[0]); } vcfroh(args, NULL); fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); destroy_data(args); free(args); return 0; }
int main_vcfquery(int argc, char *argv[]) { int c, collapse = 0; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"list-samples",0,0,'l'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"format",1,0,'f'}, {"output-file",1,0,'o'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"annots",1,0,'a'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"print-header",0,0,'H'}, {"collapse",1,0,'c'}, {"vcf-list",1,0,'v'}, {"allow-undef-tags",0,0,'u'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) { switch (c) { case 'o': args->fn_out = optarg; break; case 'f': args->format_str = strdup(optarg); break; case 'H': args->print_header = 1; break; case 'v': args->vcf_list = optarg; break; case 'c': if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'a': { kstring_t str = {0,0,0}; kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str); char *p = optarg; while ( *p ) { if ( *p==',' ) kputs("\t%", &str); else kputc(*p, &str); p++; } kputc('\n', &str); args->format_str = str.s; break; } case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; 
targets_is_file = 1; break; case 'l': args->list_columns = 1; break; case 'u': args->allow_undef_tags = 1; break; case 's': args->sample_list = optarg; break; case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } char *fname = NULL; if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; } else fname = argv[optind]; if ( args->list_columns ) { if ( !fname ) error("Missing the VCF file name\n"); args->files = bcf_sr_init(); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); list_columns(args); bcf_sr_destroy(args->files); free(args); return 0; } if ( !args->format_str ) usage(); args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout; if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( !args->vcf_list ) { if ( !fname ) usage(); args->files = bcf_sr_init(); args->files->collapse = collapse; if ( optind+1 < argc ) args->files->require_index = 1; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } while ( fname ) { if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); fname = ++optind < argc ? 
argv[optind] : NULL; } init_data(args); query_vcf(args); free(args->format_str); destroy_data(args); bcf_sr_destroy(args->files); fclose(args->out); free(args); return 0; } // multiple VCFs int i, k, nfiles, prev_nsamples = 0; char **fnames, **prev_samples = NULL; fnames = hts_readlist(args->vcf_list, 1, &nfiles); if ( !nfiles ) error("No files in %s?\n", args->vcf_list); for (i=0; i<nfiles; i++) { args->files = bcf_sr_init(); args->files->collapse = collapse; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( optind < argc ) args->files->require_index = 1; if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum)); for (k=optind; k<argc; k++) if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum)); init_data(args); if ( i==0 ) prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header)); else { args->print_header = 0; if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) ) error("Different samples in %s and %s\n", fnames[i-1],fnames[i]); } query_vcf(args); destroy_data(args); bcf_sr_destroy(args->files); } fclose(args->out); destroy_list(fnames, nfiles); destroy_list(prev_samples, prev_nsamples); free(args->format_str); free(args); return 0; }
int run(int argc, char **argv) { args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->nsites = 10; args->min_hets = 0.3; args->background = "X:60001-2699520"; static struct option loptions[] = { {"verbose",1,0,'v'}, {"ploidy",1,0,'p'}, {"nsites",1,0,'n'}, {"guess",1,0,'g'}, {"min-hets",1,0,'m'}, {"background",1,0,'b'}, {0,0,0,0} }; char c, *tmp, *ploidy_fname = NULL; while ((c = getopt_long(argc, argv, "p:n:g:m:vb:",loptions,NULL)) >= 0) { switch (c) { case 'b': if ( !strcmp("-",optarg) ) args->background = NULL; else args->background = optarg; break; case 'v': args->verbose = 1; break; case 'g': if ( !strcasecmp(optarg,"GT") ) args->guess = GUESS_GT; else if ( !strcasecmp(optarg,"PL") ) args->guess = GUESS_PL; else if ( !strcasecmp(optarg,"GL") ) args->guess = GUESS_GL; else error("The argument not recognised, expected --guess GT, --guess PL or --guess GL: %s\n", optarg); break; case 'm': args->min_hets = strtod(optarg,&tmp); if ( *tmp ) error("Unexpected argument to --min-hets: %s\n", optarg); break; case 'p': ploidy_fname = optarg; break; case 'n': args->nsites = strtol(optarg,&tmp,10); if (*tmp) error("Unexpected argument to --nsites: %s\n", optarg); break; case 'h': case '?': default: error("%s", usage()); break; } } args->sr = bcf_sr_init(); args->sr->require_index = 1; if ( !argv[0] ) error("%s", usage()); if ( !bcf_sr_add_reader(args->sr,argv[0]) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); args->hdr = args->sr->readers[0].header; args->nsample = bcf_hdr_nsamples(args->hdr); args->dflt_ploidy = 2; if ( ploidy_fname ) { args->ploidy = ploidy_init(ploidy_fname, args->dflt_ploidy); if ( !args->ploidy ) error("Could not read %s\n", ploidy_fname); } else { args->ploidy = ploidy_init_string( "X 1 60000 M 1\n" "X 2699521 154931043 M 1\n" "Y 1 59373566 M 1\n" "Y 1 59373566 F 0\n", args->dflt_ploidy); } args->nsex = ploidy_nsex(args->ploidy); args->sex2ploidy = (int*) malloc(sizeof(int)*args->nsex); args->max_ploidy = ploidy_max(args->ploidy); if ( 
args->guess && args->max_ploidy > 2 ) error("Sorry, ploidy %d not supported with -g\n", args->max_ploidy); args->ncounts = args->nsample * ((args->max_ploidy>2 ? args->max_ploidy : 2)+1); args->counts = (int*) malloc(sizeof(int)*args->ncounts); args->bg_counts = (count_t*) calloc(args->nsample,sizeof(count_t)); args->sex2prob = (float*) calloc(args->nsample*args->nsex,sizeof(float)); int i, nseq; for (i=0; i<args->nsample*args->nsex; i++) args->sex2prob[i] = 1; if ( args->verbose && args->guess ) printf("# [1]REG\t[2]Region\t[3]Sample\t[4]Het fraction\t[5]nHet\t[6]nHom\t[7]nMissing\n"); // First get the counts from expected haploid regions regidx_t *idx = ploidy_regions(args->ploidy); char **seqs = regidx_seq_names(idx, &nseq); for (i=0; i<nseq; i++) { regitr_t itr; regidx_overlap(idx, seqs[i], 0, UINT32_MAX, &itr); while ( itr.i < itr.n ) { if ( args->guess ) itr.i += process_region_guess(args, seqs[i], &itr); else itr.i += process_region_precise(args, seqs[i], &itr); } } // Get the counts from a PAR (the background diploid region) and see if the fraction // of hets is different if ( args->guess ) sex2prob_guess(args); for (i=0; i<args->nsample; i++) { int j, jmax = 0; float max = 0, sum = 0; for (j=0; j<args->nsex; j++) { sum += args->sex2prob[i*args->nsex+j]; if ( max < args->sex2prob[i*args->nsex+j] ) { jmax = j; max = args->sex2prob[i*args->nsex+j]; } } if ( args->verbose ) printf("%s\t%s\t%f\n", args->hdr->samples[i],ploidy_id2sex(args->ploidy,jmax),args->sex2prob[i*args->nsex+jmax]/sum); else printf("%s\t%s\n", args->hdr->samples[i],ploidy_id2sex(args->ploidy,jmax)); } bcf_sr_destroy(args->sr); ploidy_destroy(args->ploidy); destroy_regs(args); free(args->sex2ploidy); free(args->counts); free(args->bg_counts); free(args->gts); free(args->pls); free(args->sex2prob); free(args); return 0; }
/*
 *  Opens the input through a synced reader (with optional targets and
 *  regions), sets up the sample and ploidy arrays, creates the output header
 *  and output file, initialises the selected caller and writes the header.
 *
 *  With CF_QCALL the function returns right after opening the output file:
 *  no caller is initialised and no header is written.
 *
 *  Failures are reported through error().
 *  NOTE(review): error() is presumably non-returning.
 */
static void init_data(args_t *args)
{
    args->aux.srs = bcf_sr_init();

    // Open files for input and output, initialize structures
    if ( args->targets )
    {
        if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
            error("Failed to read the targets: %s\n", args->targets);

        if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
        {
            // Register a callback and its data for target sites missed by
            // the reader
            args->aux.srs->targets->missed_reg_handler = print_missed_line;
            args->aux.srs->targets->missed_reg_data = args;
        }
    }
    if ( args->regions )
    {
        if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions);
    }

    if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
    args->aux.hdr = bcf_sr_get_header(args->aux.srs,0);

    int i;
    if ( args->samples_fname )
    {
        set_samples(args, args->samples_fname, args->samples_is_file);
        if ( args->aux.flag&CALL_CONSTR_TRIO )
        {
            if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
            fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
        }
        args->nsex = ploidy_nsex(args->ploidy);
        args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
        args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
        args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
        // Everything starts out with ploidy 2
        for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
        for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
    }

    if ( args->samples_map )
    {
        // Output header restricted to the requested samples
        args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map);
        if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n");
        for (i=0; i<args->nsamples; i++)
            if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
        if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
    }
    else
    {
        // Output header is a copy of the input header
        args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
        for (i=0; i<args->nsamples; i++)
            if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
                error("No such sample: %s\n", args->samples[i]);
    }

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));

    if ( args->flag & CF_QCALL )
        return;

    if ( args->flag & CF_MCALL )
        mcall_init(&args->aux);

    if ( args->flag & CF_CCALL )
        ccall_init(&args->aux);

    if ( args->flag&CF_GVCF )
    {
        bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        args->gvcf.rid = -1;
        args->gvcf.line = bcf_init1();
        // Two unphased allele-0 entries per sample
        args->gvcf.gt = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr));
        for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
        {
            args->gvcf.gt[2*i+0] = bcf_gt_unphased(0);
            args->gvcf.gt[2*i+1] = bcf_gt_unphased(0);
        }
    }

    // The QS and I16 INFO definitions are dropped from the output header
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");

    bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
    bcf_hdr_write(args->out_fh, args->aux.hdr);

    if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}