/*
 *  process_region_precise() - sample genotype ploidy across one region and
 *  update the per-sample sex probabilities.
 *
 *  @args:  plugin state. Reads sr, hdr, ploidy, nsites, nsample, nsex,
 *          max_ploidy, verbose; overwrites counts, gts/ngts and scales
 *          sex2prob in place.
 *  @seq:   name of the sequence the region lies on
 *  @itr:   region iterator; itr->reg[itr->i] is the region to process
 *
 *  Up to args->nsites positions, evenly spaced between the region's start
 *  and end, are visited by seeking the synced reader. At each site the
 *  number of leading non-missing alleles of every sample's genotype is
 *  tallied, and afterwards each sex's probability is multiplied by the
 *  fraction of sites showing the ploidy stored for that sex in
 *  args->sex2ploidy.
 *
 *  Returns the number of consecutive iterator entries, starting at the
 *  current one, that share the same start and end (always >= 1). The same
 *  value is returned early when the seek fails or no record follows it.
 */
int process_region_precise(args_t *args, char *seq, regitr_t *itr)
{
    // Count neighbouring iterator entries describing the identical interval
    int k = 1;
    uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end;
    while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++;

    // NOTE(review): presumably fills args->sex2ploidy for this position - confirm against ploidy_query()
    int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL);
    assert(ret);
    (void) ret;     // keep NDEBUG builds free of an unused-variable warning

    memset(args->counts,0,args->ncounts*sizeof(int));

    // Select 'nsites' sites spaced so that they evenly cover the whole region
    // to get a representative sample. We index-jump as we should be checking
    // a few sites only.
    int i, pos, prev_pos = -1, ismpl;
    for (i=0; i<args->nsites; i++)
    {
        pos = ((i+1.0)/(args->nsites+1))*(end - start) + start;
        if ( i>0 && pos <= prev_pos ) continue;     // the vcf is too sparse

        if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k;   // sequence not present
        if ( !bcf_sr_next_line(args->sr) ) return k;        // no sites found

        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rec->pos > end ) break;                // ran past the region
        prev_pos = rec->pos;

        int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts);
        if ( ngts <= 0 ) continue;                  // no usable FORMAT/GT, the site says nothing about ploidy
        ngts /= args->nsample;

        for (ismpl=0; ismpl<args->nsample; ismpl++)
        {
            int32_t *gts = args->gts + ngts*ismpl;
            int igt, ploidy = 0;
            for (igt=0; igt<ngts; igt++)
            {
                if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break;
                ploidy++;
            }

            // Each sample owns max_ploidy+1 counters; a genotype with more
            // alleles than that has no counter and must not write past them.
            if ( ploidy <= args->max_ploidy )
                args->counts[ismpl*(args->max_ploidy+1) + ploidy]++;

            if ( args->verbose )
                fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy);
        }
    }

    // Scale each sex's probability by the fraction of sites supporting its ploidy
    for (ismpl=0; ismpl<args->nsample; ismpl++)
    {
        float sum = 0, *probs = args->sex2prob + ismpl*args->nsex;
        int *counts = args->counts + ismpl*(args->max_ploidy+1);
        for (i=0; i<args->max_ploidy+1; i++) sum += counts[i];
        if ( !sum ) continue;       // no evidence collected for this sample
        for (i=0; i<args->nsex; i++)
        {
            int ploidy = args->sex2ploidy[i];
            probs[i] *= counts[ploidy]/sum;
        }
    }

    return k;
}
/*
 *  isec_vcf() - walk all readers in sync and output the sites selected by
 *  args->isec_op.
 *
 *  @args:  program state. Reads files, nwrite, prefix, targets_list,
 *          output_fname, output_type, n_threads, record_cmd_line, iwrite,
 *          nflt/flt/flt_logic, isec_op, isec_n, isec_exact, fh_sites,
 *          fh_out and write.
 *
 *  Output goes to one of three places:
 *   - a single VCF/BCF stream (-o file or standard output) when exactly one
 *     file is to be written and no prefix is set, or when targets are used
 *     with a single reader;
 *   - a tab-delimited list of sites (args->fh_sites);
 *   - per-reader files under the prefix (args->fh_out).
 *
 *  Records failing a reader's filter are treated as if the reader had no
 *  line at that position. Write failures are reported through error().
 */
void isec_vcf(args_t *args)
{
    bcf_srs_t *files = args->files;
    kstring_t str = {0,0,0};
    htsFile *out_fh = NULL;
    const char *out_name = args->output_fname ? args->output_fname : "standard output";

    // When only one VCF is output, print VCF to pysam_stdout or -o file
    int out_std = 0;
    if ( args->nwrite==1 && !args->prefix ) out_std = 1;
    if ( args->targets_list && files->nreaders==1 ) out_std = 1;
    if ( out_std )
    {
        out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
        if ( out_fh == NULL ) error("Can't write to %s: %s\n", out_name, strerror(errno));
        if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
        if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
        if ( bcf_hdr_write(out_fh, files->readers[args->iwrite].header)!=0 )
            error("[%s] Error: cannot write the header to %s\n", __func__, out_name);
    }
    if ( !args->nwrite && !out_std && !args->prefix )
        fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n");

    int n;
    while ( (n=bcf_sr_next_line(files)) )
    {
        bcf_sr_t *reader = NULL;
        bcf1_t *line = NULL;
        int i, ret = 0;
        for (i=0; i<files->nreaders; i++)
        {
            if ( !bcf_sr_has_line(files,i) ) continue;

            if ( args->nflt && args->flt[i] )
            {
                bcf1_t *rec = bcf_sr_get_line(files, i);
                int pass = filter_test(args->flt[i], rec, NULL);
                if ( args->flt_logic[i] & FLT_EXCLUDE ) pass = pass ? 0 : 1;
                if ( !pass )
                {
                    // Pretend the reader has no record at this position
                    files->has_line[i] = 0;
                    n--;
                    continue;
                }
            }

            // Remember the first reader that still has a record
            if ( !line )
            {
                line = files->readers[i].buffer[0];
                reader = &files->readers[i];
            }

            // The bitmask is consulted only for two files (OP_VENN); shifting
            // an int by 31 or more is undefined, so higher readers are left out.
            if ( i < 31 ) ret |= 1<<i;
        }

        // Filters removed every record at this position: nothing to report,
        // and 'line'/'reader' must not be dereferenced below.
        if ( !line ) continue;

        switch (args->isec_op)
        {
            case OP_COMPLEMENT:
                if ( n!=1 || !bcf_sr_has_line(files,0) ) continue;
                break;
            case OP_EQUAL:
                if ( n != args->isec_n ) continue;
                break;
            case OP_PLUS:
                if ( n < args->isec_n ) continue;
                break;
            case OP_MINUS:
                if ( n > args->isec_n ) continue;
                break;
            case OP_EXACT:
                for (i=0; i<files->nreaders; i++)
                    if ( files->has_line[i] != args->isec_exact[i] ) break;
                if ( i<files->nreaders ) continue;
                break;
            default:
                break;
        }

        if ( out_std )
        {
            if ( bcf_sr_has_line(files,args->iwrite) )
            {
                if ( bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0])!=0 )
                    error("[%s] Error: cannot write to %s\n", __func__, out_name);
            }
            continue;
        }
        else if ( args->fh_sites )
        {
            // CHROM, POS, REF, comma-separated ALTs, then one 0/1 flag per reader
            str.l = 0;
            kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str);
            kputw(line->pos+1, &str); kputc('\t', &str);
            if (line->n_allele > 0) kputs(line->d.allele[0], &str);
            else kputc('.', &str);
            kputc('\t', &str);
            if (line->n_allele > 1) kputs(line->d.allele[1], &str);
            else kputc('.', &str);
            for (i=2; i<line->n_allele; i++)
            {
                kputc(',', &str);
                kputs(line->d.allele[i], &str);
            }
            kputc('\t', &str);
            for (i=0; i<files->nreaders; i++)
                kputc(bcf_sr_has_line(files,i)?'1':'0', &str);
            kputc('\n', &str);
            if ( fwrite(str.s,sizeof(char),str.l,args->fh_sites)!=str.l )
                error("[%s] Error: cannot write the list of sites\n", __func__);
        }

        if ( args->prefix )
        {
            if ( args->isec_op==OP_VENN && ret==3 )
            {
                // Shared site: each file's version goes to its own "shared" output
                if ( !args->nwrite || args->write[0] )
                {
                    if ( bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0))!=0 )
                        error("[%s] Error: cannot write to the output file\n", __func__);
                }
                if ( !args->nwrite || args->write[1] )
                {
                    if ( bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1))!=0 )
                        error("[%s] Error: cannot write to the output file\n", __func__);
                }
            }
            else
            {
                for (i=0; i<files->nreaders; i++)
                {
                    if ( !bcf_sr_has_line(files,i) ) continue;
                    if ( args->write && !args->write[i] ) continue;
                    if ( bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0])!=0 )
                        error("[%s] Error: cannot write to the output file\n", __func__);
                }
            }
        }
    }

    free(str.s);
    if ( out_fh && hts_close(out_fh)!=0 )
        error("[%s] Error: close failed .. %s\n", __func__, out_name);
}
/*
 *  init_data() - read the input VCF, build per-chromosome BAF histograms and
 *  create the output files.
 *
 *  @args:  program state. Reads regions_list, targets_list, fname, sample,
 *          nbins, verbose, output_dir, argc/argv; fills sample (when unset),
 *          xvals, dist/ndist, dat_fp and dat_fname.
 *
 *  Steps:
 *   1. open the VCF with optional regions/targets and subset it to a single
 *      sample (required via -s when the file has more than one);
 *   2. bin FORMAT/BAF of every record into an nbins-wide histogram, one
 *      histogram per chromosome block encountered;
 *   3. hand each histogram to init_dist();
 *   4. open <output_dir>/dist.dat and write its header lines;
 *   5. write the executable plotting script <output_dir>/dist.py.
 *
 *  All failures are reported through error().
 */
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) ) error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;

    // Work with exactly one sample
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    // Bin coordinates shared by all distributions: nbins points spanning [0,1]
    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, nprocessed = 0, ntotal = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        ntotal++;

        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        // The bin index below is only valid for values in [0,1]; anything else
        // (including NaN, which fails both comparisons) would index outside yvals.
        if ( !(baf[0]>=0 && baf[0]<=1) ) continue;

        nprocessed++;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    // Data file: header only, the records are appended elsewhere
    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
    //-------- matplotlib script --------------
    // Python block structure depends on the leading whitespace of each line,
    // so the indentation inside these literals is significant.
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        "    csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "    with open(outdir+'/dist.dat', 'rb') as f:\n"
        "        reader = csv.reader(f, 'tab')\n"
        "        for row in reader:\n"
        "            if row[0][0]=='#': continue\n"
        "            type = row[0]\n"
        "            chr = row[1]\n"
        "            if type=='DIST':\n"
        "                if chr not in dat: dat[chr] = []\n"
        "                dat[chr].append(row)\n"
        "            elif type=='FIT':\n"
        "                if chr not in fit: fit[chr] = []\n"
        "                fit[chr].append(row)\n"
        "            elif type=='CN':\n"
        "                cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "    if chr in fit:\n"
        "        for i in range(len(fit[chr])):\n"
        "            pfit = fit[chr][i]\n"
        "            exec('def xfit(x): return '+pfit[5])\n"
        "            istart = int(pfit[3])\n"
        "            iend = int(pfit[4])+1\n"
        "            vals = dat[chr][istart:iend]\n"
        "            args = {}\n"
        "            if i==0: args = {'label':'Target to Fit'}\n"
        "            ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "            if i==0: args = {'label':'Best Fit'}\n"
        "            ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "    ax.set_title('BAF distribution, chr'+chr)\n"
        "    ax.set_xlabel('BAF')\n"
        "    ax.set_ylabel('Frequency')\n"
        "    ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "    plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "    plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        "    fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "    xlabels = sorted(cn.keys())\n"
        "    xvals = range(len(xlabels))\n"
        "    yvals = [float(cn[x]) for x in xlabels]\n"
        "    ax.plot(xvals,yvals,'o',color='red')\n"
        "    for i in range(len(xvals)):\n"
        "        if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "    ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "    ax.set_xticks(xvals)\n"
        "    ax.set_xticklabels(xlabels,rotation=45)\n"
        "    ax.set_xlim(-1,len(xlabels))\n"
        "    ax.set_ylim(0,5.0)\n"
        "    ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "    ax.set_xlabel('Chromosome')\n"
        "    ax.set_ylabel('Copy Number')\n"
        "    plt.savefig(outdir+'/copy-number.png')\n"
        "    plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        "    def error(self, message):\n"
        "        self.print_help()\n"
        "        sys.stderr.write('error: %%s\\n' %% message)\n"
        "        sys.exit(2)\n"
        "\n"
        "def main():\n"
        "    parser = myParser()\n"
        "    parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "    parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "    parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "    args = parser.parse_args()\n"
        "    dat = {}; fit = {}; cn = {}\n"
        "    read_dat(dat,fit,cn)\n"
        "    if args.distrib!=None:\n"
        "        plot_dist(dat,fit,args.distrib)\n"
        "    if args.all:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "        plot_copy_number(cn)\n"
        "    elif args.copy_number:\n"
        "        plot_copy_number(cn)\n"
        "    else:\n"
        "        for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        "    main()\n",
        args->output_dir);
    //---------------------------------------
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
/*
 *  concat() - write the records of all input files to args->out_fh.
 *
 *  @args:  program state. Reads phased_concat, files, fnames/nfnames,
 *          start_pos, out_hdr, out_fh, output_type, remove_dups; updates
 *          ifname and seen_seq.
 *
 *  Three modes, chosen in this order:
 *   1. args->phased_concat: at most two readers are kept open at a time and
 *      overlapping records are handed to phased_push()/phased_flush();
 *   2. args->files set: overlapping files are merged through the synced
 *      reader, optionally keeping only the first copy of a record;
 *   3. otherwise: files are appended one after another. Every chromosome
 *      must form one contiguous block and positions within it must not
 *      decrease, else error() is called.
 */
static void concat(args_t *args)
{
    int i;
    if ( args->phased_concat )  // phased concat
    {
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
        {
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
            {
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) )
                    error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                args->ifname++;
                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome
            }

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
            {
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            }
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
            {
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                {
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    phased_flush(args);
                    bcf_sr_remove_reader(args->files, 0);
                }

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                // Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                {
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) )
                        error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                    args->ifname++;
                }
                if ( must_seek )
                {
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));
                    continue;
                }

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);
            }

            if ( args->files->nreaders )
            {
                phased_flush(args);
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
            }
        }
    }
    else if ( args->files )  // combining overlapping files, using synced reader
    {
        while ( bcf_sr_next_line(args->files) )
        {
            for (i=0; i<args->files->nreaders; i++)
            {
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                if ( bcf_write1(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                if ( args->remove_dups ) break;     // the first copy wins
            }
        }
    }
    else    // concatenating
    {
        kstring_t tmp = {0,0,0};
        int prev_chr_id = -1, prev_pos = -1;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
        {
            htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
            {
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                {
                    // The first tab-delimited field is the sequence name
                    char *str = fp->line.s;
                    while ( *str && *str!='\t' ) str++;
                    tmp.l = 0;
                    kputsn(fp->line.s,str-fp->line.s,&tmp);
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    }
                    char *end;
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;
                    prev_pos = pos;     // without this the sortedness check above can never fire

                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %zu bytes\n", fp->line.l);
                }
            }
            else
            {
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                {
                    bcf_translate(args->out_hdr, hdr, line);

                    if ( prev_chr_id!=line->rid )
                    {
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    }
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;
                    prev_pos = line->pos;   // remember for the sortedness check of the next record

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
                }
            }
            bcf_hdr_destroy(hdr);
            hts_close(fp);
        }
        bcf_destroy(line);
        free(tmp.s);
    }
}
/*
 * Load bi-allelic variants of a region from an indexed VCF/BCF file.
 *
 * fvcf:   path of the VCF/BCF file
 * region: region string handed to bcf_sr_set_regions()
 *
 * All genotype_* members are reset first. For every bi-allelic record that
 * carries either one FORMAT/DS value per sample or a diploid FORMAT/GT, and
 * whose ID passes filter_genotype, one entry is appended: ID, chromosome,
 * 1-based start, end (INFO/END when it holds a single value, otherwise start
 * plus the REF length minus one) and a per-sample dosage vector. Dosages come
 * from DS when it is available, otherwise from the sum of the two GT alleles;
 * a missing GT allele gives bcf_float_missing. Only samples for which
 * findSample() returns a non-negative index are stored.
 */
void union_data::readGenotypesVCF(string fvcf,string region) {
	int n_includedG = 0;
	int n_excludedG_mult = 0;
	int n_excludedG_void = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	vector < int > mappingS;

	genotype_id.clear();
	genotype_chr.clear();
	genotype_start.clear();
	genotype_end.clear();
	genotype_val.clear();
	genotype_count=0;
	genotype_id_to_idx.clear();

	//Opening files
	bcf_srs_t * sr = bcf_sr_init();

	//vrb.bullet("target region [" + regionGenotype.get() + "]");
	//if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
	bcf_sr_set_regions(sr, region.c_str(), 0);
	if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
		switch (sr->errnum) {
		case not_bgzf: vrb.error("File not compressed with bgzip!");
		case idx_load_failed: vrb.error("Impossible to load index file!");
		case file_type_error: vrb.error("File format not detected by htslib!");
		default : vrb.error("Unknown error!");
		}
	}

	//Sample processing
	int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
	for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
		mappingS.push_back(findSample(string(sr->readers[0].header->samples[i0])));
		if (mappingS.back() >= 0) n_includedS++;
	}

	//Read genotype data
	int ngt, ngt_arr = 0, nds, nds_arr = 0, * geno_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
	float * ds_arr = NULL;
	bcf1_t * line;
	unsigned int linecount = 0;
	while(bcf_sr_next_line (sr)) {
		linecount ++;
		if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		line = bcf_sr_get_line(sr, 0);
		if (line->n_allele == 2) {
			ngt = bcf_get_genotypes(sr->readers[0].header, line, &geno_arr, &ngt_arr);
			nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
			if (nds == n_samples || ngt == 2*n_samples) {
				bcf_unpack(line, BCF_UN_STR);
				string sid = string(line->d.id);
				if (filter_genotype.check(sid)) {
					genotype_id.push_back(sid);
					genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
					string genotype_ref = string(line->d.allele[0]);
					genotype_start.push_back(line->pos + 1);

					// The return value is the number of values read; nsl_arr is only
					// the capacity of the reusable buffer and says nothing about this record.
					nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
					if (nsl == 1) genotype_end.push_back(sl_arr[0]);
					else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);

					genotype_val.push_back(vector < float > (sample_count, 0.0));

					// DS is indexed per sample, so it is usable only with exactly one
					// value per sample; otherwise the enclosing test guarantees diploid GT.
					bool use_ds = (nds == n_samples);
					for(int i = 0 ; i < n_samples ; i ++) {
						if (mappingS[i] >= 0) {
							if (use_ds) genotype_val.back()[mappingS[i]] = ds_arr[i];
							else {
								if (geno_arr[2*i+0] == bcf_gt_missing || geno_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
								else genotype_val.back()[mappingS[i]] = bcf_gt_allele(geno_arr[2*i+0]) + bcf_gt_allele(geno_arr[2*i+1]);
							}
						}
					}
					pair < string, int > temp (sid,n_includedG);
					genotype_id_to_idx.insert(temp);
					n_includedG++;
				} else n_excludedG_user ++;
			} else n_excludedG_void ++;
		} else n_excludedG_mult ++;
	}

	//Finalize
	free(geno_arr);
	free(ds_arr);
	free(sl_arr);
	bcf_sr_destroy(sr);
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
	//if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
	//if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
}
int main_plugin(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->output_fname = "-"; args->output_type = FT_VCF; args->nplugin_paths = -1; int regions_is_file = 0, targets_is_file = 0, plist_only = 0; if ( argc==1 ) usage(args); char *plugin_name = NULL; if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; } static struct option loptions[] = { {"verbose",0,0,'v'}, {"help",0,0,'h'}, {"list-plugins",0,0,'l'}, {"output",1,0,'o'}, {"output-type",1,0,'O'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "h?o:O:r:R:li:e:v",loptions,NULL)) >= 0) { switch (c) { case 'v': args->verbose = 1; break; case 'o': args->output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'l': plist_only = 1; break; case '?': case 'h': load_plugin(args, plugin_name, 1, &args->plugin); fprintf(stderr,"%s",args->plugin.usage()); return 0; break; default: error("Unknown argument: %s\n", optarg); } } if ( plist_only ) return list_plugins(args); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(args); 
args->plugin.argc = argc - optind + 1; args->plugin.argv = argv + optind - 1; } else { fname = argv[optind]; args->plugin.argc = argc - optind; args->plugin.argv = argv + optind; } optind = 0; args->plugin.argv[0] = plugin_name; load_plugin(args, plugin_name, 1, &args->plugin); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); args->files->collapse |= COLLAPSE_SOME; } if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open or the file not indexed: %s\n", fname); init_data(args); while ( bcf_sr_next_line(args->files) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); if ( args->filter ) { int pass = filter_test(args->filter, line, NULL); if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1; if ( !pass ) continue; } line = args->plugin.process(line); if ( line ) bcf_write1(args->out_fh, args->hdr_out, line); } destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
int run(int argc, char **argv) { char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL; memset(&args,0,sizeof(args_t)); args.mode = 0; args.output_fname = "-"; static struct option loptions[] = { {"trio",1,0,'t'}, {"trio-file",1,0,'T'}, {"delete",0,0,'d'}, {"list",1,0,'l'}, {"count",0,0,'c'}, {"rules",1,0,'r'}, {"rules-file",1,0,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0) { switch (c) { case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'R': rules_fname = optarg; break; case 'r': rules_string = optarg; break; case 'd': args.mode |= MODE_DELETE; break; case 'c': args.mode |= MODE_COUNT; break; case 'l': if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; else error("The argument not recognised: --list %s\n", optarg); break; case 't': trio_samples = optarg; break; case 'T': trio_file = optarg; break; case 'h': case '?': default: error("%s",usage()); break; } } if ( rules_fname ) args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); else args.rules = init_rules(&args, rules_string); if ( !args.rules ) return -1; args.itr = regitr_init(args.rules); args.itr_ori = regitr_init(args.rules); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else error("%s",usage()); } else fname = argv[optind]; if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n"); if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); if 
( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; args.sr = bcf_sr_init(); if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); args.hdr = bcf_sr_get_header(args.sr, 0); args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); bcf_hdr_write(args.out_fh, args.hdr); int i, n = 0; char **list; if ( trio_samples ) { args.ntrios = 1; args.trios = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(trio_samples, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); for (i=0; i<n; i++) free(list[i]); free(list); } if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; i<n; i++) { char *ss = list[i], *se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s: %s\n",trio_file, ss); *se = 0; args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s\n",trio_file); *se = 0; args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; if ( *ss=='\0' ) error("Could not parse %s\n",trio_file); args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss); free(list[i]); } free(list); } while ( bcf_sr_next_line(args.sr) ) { bcf1_t *line = bcf_sr_get_line(args.sr,0); line = process(line); if ( line ) { if ( 
line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); bcf_write1(args.out_fh, args.hdr, line); } } fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); for (i=0; i<args.ntrios; i++) { trio_t *trio = &args.trios[i]; fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ); } free(args.gt_arr); free(args.trios); regitr_destroy(args.itr); regitr_destroy(args.itr_ori); regidx_destroy(args.rules); bcf_sr_destroy(args.sr); if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); return 0; }
/*
 * Load the reference panel of variants from a VCF/BCF file.
 *
 * fvcf: path of the VCF/BCF file
 *
 * A record is kept when it is bi-allelic, lies on a chromosome known to
 * findCHR(), carries a diploid genotype for every sample, has a minor allele
 * frequency (computed over the samples for which findSample() returns a
 * non-negative index) of at least threshold_maf, and getDistance() reports
 * less than 1e6. For each kept record the chromosome index, 1-based position,
 * MAF, distance and the two haplotype alleles of every included sample are
 * appended to the genotype_* members; genotype_qtl, genotype_gwas and
 * genotype_bin are then sized to match. Missing genotypes are not supported
 * and trip the assert.
 */
void genrich_data::readReferenceGenotypes(string fvcf) {
	vector < int > mappingS;

	//Opening files
	vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf));
	bcf_srs_t * sr = bcf_sr_init();
	if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
		switch (sr->errnum) {
		case not_bgzf: vrb.error("File not compressed with bgzip!");
		case idx_load_failed: vrb.error("Impossible to load index file!");
		case file_type_error: vrb.error("File format not detected by htslib!");
		default : vrb.error("Unknown error!");
		}
	}

	//Sample processing
	int included_sample = 0;
	int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
	for (int i = 0 ; i < n_samples ; i ++) {
		mappingS.push_back(findSample(string(sr->readers[0].header->samples[i])));
		if (mappingS.back() >= 0) included_sample ++;
	}
	vrb.bullet("#samples = " + stb.str(included_sample));

	//Variant processing
	unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0;
	int ngt, ngt_arr = 0, *geno_arr = NULL;
	bcf1_t * line;
	while(bcf_sr_next_line (sr)) {
		line = bcf_sr_get_line(sr, 0);
		if (line->n_allele == 2) {
			bcf_unpack(line, BCF_UN_STR);
			string sid = string(line->d.id);
			string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
			int chr_idx = findCHR(chr);
			if (chr_idx >= 0) {
				unsigned int pos = line->pos + 1;
				ngt = bcf_get_genotypes(sr->readers[0].header, line, &geno_arr, &ngt_arr);
				if (ngt == 2*n_samples) {
					// Allele frequency over the included samples only
					double freq = 0.0, tot = 0.0;
					for(int i = 0 ; i < n_samples ; i ++) {
						assert(geno_arr[2*i+0] != bcf_gt_missing && geno_arr[2*i+1] != bcf_gt_missing);
						if (mappingS[i] >= 0) {
							freq += bcf_gt_allele(geno_arr[2*i+0]) + bcf_gt_allele(geno_arr[2*i+1]);
							tot += 2.0;
						}
					}
					double maf = freq / tot;
					if (maf > 0.5) maf = 1.0 - maf;
					if (maf >= threshold_maf) {
						int dist_tss = getDistance(chr_idx, pos);
						if (dist_tss < 1e6) {
							string tmp_id = chr + "_" + stb.str(pos);
							genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size()));
							genotype_chr.push_back(chr_idx);
							genotype_pos.push_back(pos);
							genotype_maf.push_back(maf);
							genotype_dist.push_back(dist_tss);
							genotype_haps.push_back(vector < bool > (2 * included_sample, false));
							for(int i = 0 ; i < n_samples ; i ++) {
								if (mappingS[i] >= 0) {
									genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(geno_arr[2 * i + 0]);
									genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(geno_arr[2 * i + 1]);
								}
							}
						} else n_excludedV_toofar++;
					} else n_excludedV_rare ++;
				} else n_excludedV_void ++;
			} else n_excludedV_uchr ++;
		} else n_excludedV_mult ++;

		if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line));
		n_line ++;
	}
	genotype_qtl = vector < bool > (genotype_pos.size(), false);
	genotype_gwas = vector < bool > (genotype_pos.size(), false);
	genotype_bin = vector < int > (genotype_pos.size(), -1);

	//Finalize
	free(geno_arr);		// buffer grown by bcf_get_genotypes()
	bcf_sr_destroy(sr);
	vrb.bullet(stb.str(genotype_pos.size()) + " variants included");
	if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded");
	if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss");
	if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants");
	if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants");
}
/*
 * Compare every pair of samples across all records of args->files and print
 * the results to stdout:
 *   - ERR lines: per-pair error rate and number of compared sites,
 *   - CLUSTER / DBG / TH / DOT lines: output of the hclust_* helpers run on
 *     the pairwise error rates,
 *   - CN lines: deprecated per-pair raw counts.
 * Pair (i,j) with j<i is stored at a running index that grows with j inside i,
 * i.e. args->narr = nsmpl*(nsmpl-1)/2 entries in total.
 * Releases args->tmp_arr before returning.
 */
static void cross_check_gts(args_t *args)
{
    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
    {
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        if ( !args->no_PLs )
        {
            fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
            args->no_PLs = 99;
        }
    }

    args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
    args->narr  = (args->nsmpl-1)*args->nsmpl/2;

    // Per-pair counters: number of differing sites and number of compared sites
    uint32_t *ndif = (uint32_t*) calloc(args->narr,sizeof(*ndif));
    uint32_t *ntot = (uint32_t*) calloc(args->narr,sizeof(*ntot));
    // A zero-sized request may legitimately return NULL, so only treat NULL as
    // a failure when there is at least one pair.
    if ( args->narr > 0 && (!ndif || !ntot) )
        error("[E::%s] Could not allocate memory\n", __func__);

    while ( bcf_sr_next_line(args->files) )
    {
        bcf1_t *line = bcf_sr_get_line(args->files,0);

        // use PLs unless no_PLs is set and GT exists
        if ( args->no_PLs )
        {
            // process_GT()==0 means the record needs no further handling;
            // any other return value falls through to process_PL()
            if ( process_GT(args,line,ntot,ndif)==0 ) continue;
        }
        process_PL(args,line,ntot,ndif);
    }

    FILE *fp = stdout;
    print_header(args, fp);

    float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);
    if ( args->narr > 0 && !tmp )
        error("[E::%s] Could not allocate memory\n", __func__);

    // Output pairwise distances
    fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
    int i,j, idx = 0;
    for (i=0; i<args->nsmpl; i++)
    {
        for (j=0; j<i; j++)
        {
            // pairs with no compared sites get a tiny non-zero error rate
            float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
            // ntot is uint32_t, hence PRIu32
            fprintf(fp, "ERR\t%f\t%" PRIu32 "\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
            PDIST(tmp,i,j) = err;
            idx++;
        }
    }

    // Cluster samples
    int nlist;
    float clust_max_err = args->max_intra_err;
    hclust_t *clust = hclust_init(args->nsmpl,tmp);
    cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
    fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
    for (i=0; i<nlist; i++)
    {
        fprintf(fp,"CLUSTER\t%f", list[i].dist);
        for (j=0; j<list[i].nmemb; j++)
            fprintf(fp,"\t%s",args->sm_hdr->samples[list[i].memb[j]]);
        fprintf(fp,"\n");
    }
    hclust_destroy_list(list,nlist);

    // Debugging output: the cluster graph and data used for deciding
    // NOTE(review): dbg and the DOT string are not freed here; presumably they
    // are owned by clust and released by hclust_destroy() - confirm.
    char **dbg = hclust_explain(clust,&nlist);
    for (i=0; i<nlist; i++)
        fprintf(fp,"DBG\t%s\n", dbg[i]);
    fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
    fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
    fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));
    hclust_destroy(clust);
    free(tmp);

    // Deprecated output for temporary backward compatibility
    fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
    idx = 0;
    for (i=0; i<args->nsmpl; i++)
    {
        for (j=0; j<i; j++)
        {
            fprintf(fp, "CN\t%" PRIu32 "\t%" PRIu32 "\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
            idx++;
        }
    }

    free(ndif);
    free(ntot);
    free(args->tmp_arr);
}
/*
 * Collect per-sample het/hom/missing counts over one region.
 *
 * args  program state; uses args->sr (reader), args->hdr, args->guess,
 *       args->pls/npls (scratch buffer) and args->bg_counts
 * seq   sequence name; ignored and replaced by the name parsed from
 *       args->background when itr is NULL
 * itr   region iterator positioned at the region to process, or NULL to
 *       process the background region
 *
 * With an iterator, counts go to the stats returned by expand_regs();
 * without one they go to args->bg_counts.
 *
 * Returns the number of consecutive iterator entries sharing the same
 * start/end as the current one (always 1 for the background region).
 */
int process_region_guess(args_t *args, char *seq, regitr_t *itr)
{
    int kitr = 1;
    uint32_t start = 0, end = INT_MAX;
    reg_stats_t *stats = NULL;

    // set the start and the end position
    if ( itr )
    {
        start = itr->reg[itr->i].start;
        end   = itr->reg[itr->i].end;

        // flush all records with the same coordinates
        while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++;

        int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max);
        assert(ret);
        stats = expand_regs(args, seq,start,end);
    }
    else
    {
        // background region
        int spos, epos;
        const char *ptr = hts_parse_reg(args->background, &spos, &epos);
        if ( !ptr )
            error("Could not parse the region: %s\n", args->background);
        // private copy of the sequence-name part; released below once no longer needed
        seq = (char*) malloc(ptr - args->background + 1);
        memcpy(seq,args->background,ptr-args->background);
        seq[ptr-args->background] = 0;
        start = spos;
        end   = epos;
    }

    if ( bcf_sr_seek(args->sr,seq,start)!=0 )
    {
        // sequence not present
        if ( !itr ) free(seq);
        return kitr;
    }

    int ismpl, rid = bcf_hdr_name2id(args->hdr,seq);
    if ( !itr ) free(seq);

    while ( bcf_sr_next_line(args->sr) )
    {
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rec->rid!=rid || rec->pos > end ) break;

        if ( args->guess & GUESS_GT )   // use GTs to guess the ploidy
        {
            bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
            if ( !fmt ) continue;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                int gt = bcf_gt_type(fmt, ismpl, NULL,NULL);
                if ( gt==GT_UNKN ) counts->nmiss++;
                else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++;
                else counts->nhom++;
            }
        }
        else    // use PLs to guess the ploidy
        {
            // GL values are negated so that, like PL, smaller means more likely
            int gl2pl = args->guess & GUESS_PL ? 1 : -1;
            int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls);
            if ( npl<=0 ) continue;
            npl /= args->nsample;   // number of values per sample
            for (ismpl=0; ismpl<args->nsample; ismpl++)
            {
                int32_t *ptr = args->pls + ismpl*npl;
                int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0;

                // Walk the genotypes in order (0/0, 0/1, 1/1, ...), tracking the
                // smallest het and hom values. The scan stops at the first
                // missing/vector-end value or when this sample's npl values are
                // exhausted; the latter guard prevents reading into the next
                // sample (or past the buffer) when no padding is present.
                for (ial=0; ial<rec->n_allele; ial++)
                {
                    for (jal=0; jal<ial; jal++)
                    {
                        if ( k >= npl || ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break;
                        ptr[k] *= gl2pl;
                        if ( phet > ptr[k] ) phet = ptr[k];
                        k++;
                    }
                    if ( k >= npl || ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end ) break;
                    ptr[k] *= gl2pl;
                    if ( phom > ptr[k] ) phom = ptr[k];
                    k++;
                }

                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                if ( k == rec->n_allele ) counts->nhom++;    // haploid
                else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++;
                else if ( phet < phom ) counts->nhet++;
                else counts->nhom++;
            }
        }
    }
    return kitr;
}
/**
 * Fetch the current record of one of the synced readers.
 *
 * @param fileNum  index of the reader; must be smaller than m_nFiles
 * @return whatever bcf_sr_get_line() yields for that reader of m_sr
 * @throws std::range_error when fileNum is not smaller than m_nFiles
 */
bcf1_t *get_line(size_t fileNum)
{
    const bool indexInRange = fileNum < m_nFiles;
    if (!indexInRange)
    {
        throw std::range_error("fileNum is too large");
    }

    return bcf_sr_get_line(m_sr, fileNum);
}