/*
 * Samples up to args->nsites positions spread across the current region of
 * `itr` on sequence `seq`, tallies the genotype ploidy seen for each sample
 * in args->counts, and then scales each sample's args->sex2prob entries by
 * the fraction of tallied sites whose ploidy equals that sex's entry in
 * args->sex2ploidy.
 *
 * Returns k: the number of consecutive iterator entries, starting at itr->i,
 * that have the same start and end as the current one. The same value is
 * returned early when the seek fails or no record is found.
 */
int process_region_precise(args_t *args, char *seq, regitr_t *itr)
    int k = 1;
    uint32_t start = itr->reg[itr->i].start, end = itr->reg[itr->i].end;
    // Count how many of the following iterator entries duplicate this region's coordinates
    while ( itr->i+k<itr->n && start==itr->reg[itr->i+k].start && end==itr->reg[itr->i+k].end ) k++;
    // NOTE(review): `ret` is not read anywhere in the lines visible here - confirm against the full file
    int ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, NULL, NULL);


    // Select 'nsites' sites spaced so that they evenly cover the whole region 
    // to get a representative sample. We index-jump as we should be checking
    // a few sites only.
    int i, rid = -1, pos, prev_pos = -1, ismpl;
    for (i=0; i<args->nsites; i++)
        // NOTE(review): rid is reset on every iteration, so the rid!=rec->rid test below cannot fire as written
        rid = -1;
        // Target position: the (i+1)/(nsites+1) fraction of the way from start to end
        pos = ((i+1.0)/(args->nsites+1))*(end - start) + start;
        if ( i>0 && pos <= prev_pos ) continue;     // the vcf is too sparse
        if ( bcf_sr_seek(args->sr,seq,pos)!=0 ) return k;   // sequence not present
        if ( !bcf_sr_next_line(args->sr) ) return k;        // no sites found
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        if ( rid==-1 ) rid = rec->rid;
        // Stop once the record found lies beyond the region's end
        if ( rid!=rec->rid || rec->pos > end ) break;
        prev_pos = rec->pos;

        int ngts = bcf_get_genotypes(args->hdr,rec,&args->gts,&args->ngts);
        // From here on ngts is the number of genotype values per sample
        ngts /= args->nsample;
        for (ismpl=0; ismpl<args->nsample; ismpl++)
            int32_t *gts = args->gts + ngts*ismpl;
            int igt, ploidy = 0;
            // Ploidy = number of leading values before the vector-end marker or the first missing allele
            for (igt=0; igt<ngts; igt++)
                if ( gts[igt]==bcf_int32_vector_end || bcf_gt_is_missing(gts[igt]) ) break;
                else ploidy++;
            // counts is laid out as nsample rows of (max_ploidy+1) ploidy bins
            args->counts[ismpl*(args->max_ploidy+1) + ploidy]++;
            if ( args->verbose )
                fprintf(stderr,"%s:%d\t%s\tploidy=%d\n", seq,rec->pos+1,args->hdr->samples[ismpl],ploidy);

    for (ismpl=0; ismpl<args->nsample; ismpl++)
        float sum = 0, *probs = args->sex2prob + ismpl*args->nsex;
        int *counts = args->counts + ismpl*(args->max_ploidy+1);
        // Total number of tallied sites for this sample
        for (i=0; i<args->max_ploidy+1; i++) sum += counts[i];
        // Nothing was tallied for this sample: leave its probabilities untouched
        if ( !sum ) continue;
        for (i=0; i<args->nsex; i++)
            int ploidy = args->sex2ploidy[i];
            // Weight each sex by the fraction of sites showing the ploidy expected for it
            probs[i] *= counts[ploidy]/sum;

    return k;
/*
 * Walks the synced readers in args->files site by site, applies the optional
 * per-file filters, keeps the sites selected by args->isec_op, and for each
 * kept site does one or more of the following:
 *   - writes the record of reader args->iwrite to a single output stream
 *     (out_fh) when only one VCF is to be produced;
 *   - assembles a tab-delimited description of the site in `str` when
 *     args->fh_sites is set;
 *   - writes records to the per-file handles in args->fh_out when
 *     args->prefix is set.
 */
void isec_vcf(args_t *args)
    bcf_srs_t *files = args->files;
    kstring_t str = {0,0,0};
    htsFile *out_fh = NULL;

    // When only one VCF is output, print VCF to pysam_stdout or -o file
    int out_std = 0;
    if ( args->nwrite==1 && !args->prefix ) out_std = 1;
    if ( args->targets_list && files->nreaders==1 ) out_std = 1;
    if ( out_std )
        // "-" is passed as the file name when no output file name was given
        out_fh = hts_open(args->output_fname? args->output_fname : "-",hts_bcf_wmode(args->output_type));
        if ( out_fh == NULL ) error("Can't write to %s: %s\n", args->output_fname? args->output_fname : "standard output", strerror(errno));
        if ( args->n_threads ) hts_set_threads(out_fh, args->n_threads);
        if (args->record_cmd_line) bcf_hdr_append_version(files->readers[args->iwrite].header,args->argc,args->argv,"bcftools_isec");
        bcf_hdr_write(out_fh, files->readers[args->iwrite].header);
    if ( !args->nwrite && !out_std && !args->prefix )
        fprintf(pysam_stderr,"Note: -w option not given, printing list of sites...\n");

    // n is the value returned by bcf_sr_next_line() for the current site; it is compared with isec_n below
    int n;
    while ( (n=bcf_sr_next_line(files)) )
        bcf_sr_t *reader = NULL;
        bcf1_t *line = NULL;
        // ret collects one bit per reader index that gets past the checks in the loop below
        int i, ret = 0;
        for (i=0; i<files->nreaders; i++)
            if ( !bcf_sr_has_line(files,i) ) continue;

            if ( args->nflt && args->flt[i] )
                bcf1_t *rec = bcf_sr_get_line(files, i);
                int pass = filter_test(args->flt[i], rec, NULL);
                // An exclude-type filter inverts the test result
                if ( args->flt_logic[i] & FLT_EXCLUDE ) pass = pass ? 0 : 1;
                if ( !pass )
                    // Record failed the filter: mark this reader as having no line at this site
                    files->has_line[i] = 0;

            // Remember the first reader reaching this point and its buffered record
            if ( !line )
                line = files->readers[i].buffer[0];
                reader = &files->readers[i];
            ret |= 1<<i;    // this may overflow for many files, but will be used only with two (OP_VENN)

        // Each `continue` below skips the site because it does not satisfy the requested set operation
        switch (args->isec_op)
            case OP_COMPLEMENT: if ( n!=1 || !bcf_sr_has_line(files,0) ) continue; break;
            case OP_EQUAL: if ( n != args->isec_n ) continue; break;
            case OP_PLUS: if ( n < args->isec_n ) continue; break;
            case OP_MINUS: if ( n > args->isec_n ) continue; break;
            case OP_EXACT:
                // Require the has_line pattern to match isec_exact for every reader
                for (i=0; i<files->nreaders; i++)
                    if ( files->has_line[i] != args->isec_exact[i] ) break;
                if ( i<files->nreaders ) continue;

        if ( out_std )
            if ( bcf_sr_has_line(files,args->iwrite) )
                bcf_write1(out_fh, files->readers[args->iwrite].header, files->readers[args->iwrite].buffer[0]);
        else if ( args->fh_sites )
            // Build one line: contig name, 1-based position, first allele, remaining alleles
            // (comma-separated), then one '1'/'0' character per reader
            str.l = 0;
            kputs(reader->header->id[BCF_DT_CTG][line->rid].key, &str); kputc('\t', &str);
            kputw(line->pos+1, &str); kputc('\t', &str);
            if (line->n_allele > 0) kputs(line->d.allele[0], &str);
            else kputc('.', &str);
            kputc('\t', &str);
            if (line->n_allele > 1) kputs(line->d.allele[1], &str);
            else kputc('.', &str);
            for (i=2; i<line->n_allele; i++)
                kputc(',', &str);
                kputs(line->d.allele[i], &str);
            kputc('\t', &str);
            for (i=0; i<files->nreaders; i++)
                kputc(bcf_sr_has_line(files,i)?'1':'0', &str);
            kputc('\n', &str);

        if ( args->prefix )
            // ret==3 means bits 0 and 1 are both set, i.e. readers 0 and 1 both contributed
            if ( args->isec_op==OP_VENN && ret==3 )
                if ( !args->nwrite || args->write[0] )
                    bcf_write1(args->fh_out[2], bcf_sr_get_header(files,0), bcf_sr_get_line(files,0));
                if ( !args->nwrite || args->write[1] )
                    bcf_write1(args->fh_out[3], bcf_sr_get_header(files,1), bcf_sr_get_line(files,1));
                for (i=0; i<files->nreaders; i++)
                    if ( !bcf_sr_has_line(files,i) ) continue;
                    // Skip readers the user did not ask to write
                    if ( args->write && !args->write[i] ) continue;
                    bcf_write1(args->fh_out[i], files->readers[i].header, files->readers[i].buffer[0]);
    if ( str.s ) free(str.s);
    if ( out_fh ) hts_close(out_fh);
/*
 * Opens args->fname through a synced reader (optionally restricted by
 * args->regions_list / args->targets_list), restricts the header to a single
 * sample, and bins the first FORMAT/BAF value of every usable record into a
 * per-chromosome histogram (args->dist[]) with args->nbins bins. Each
 * histogram is then passed to init_dist(). Afterwards the header lines of
 * <output_dir>/dist.dat are written and <output_dir>/dist.py is opened; the
 * string literals at the end of the block are the text of a matplotlib script.
 */
static void init_data(args_t *args)
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    if ( args->targets_list )
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    if ( !bcf_sr_add_reader(files, args->fname) ) error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;
    if ( !args->sample )
        // No sample requested: acceptable only when the file does not hold more than one
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = hdr->samples[0];
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    int i;
    // Bin x-coordinates, evenly spaced from 0 to 1 inclusive; shared by all distributions
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, nprocessed = 0, ntotal = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )

        bcf1_t *line = bcf_sr_get_line(files,0);
        // Skip records for which the call does not return exactly 1, or whose BAF is missing
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;


        if ( prev_chr==-1 || prev_chr!=line->rid )
            // new chromosome
            // Append one dist_t entry; all entries share the same xvals array
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        // Bin index obtained by truncating baf*(nbins-1); assumes BAF lies within [0,1] - TODO confirm
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution

    for (idist=0; idist<args->ndist; idist++)
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
        init_dist(args, &args->dist[idist],args->verbose);

    // Header of the data file: tool versions, the command line, and the column layout of each record type
    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
//-------- matplotlib script --------------
        "#!/usr/bin/env python\n"
        "import matplotlib as mpl\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "outdir = '%s'\n"
        "def read_dat(dat,fit,cn):\n"
        "   csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "   with open(outdir+'/dist.dat', 'rb') as f:\n"
        "      reader = csv.reader(f, 'tab')\n"
        "      for row in reader:\n"
        "          if row[0][0]=='#': continue\n"
        "          type = row[0]\n"
        "          chr  = row[1]\n"
        "          if type=='DIST':\n"
        "              if chr not in dat: dat[chr] = []\n"
        "              dat[chr].append(row)\n"
        "          elif type=='FIT':\n"
        "              if chr not in fit: fit[chr] = []\n"
        "              fit[chr].append(row)\n"
        "          elif type=='CN':\n"
        "              cn[chr] = row[2]\n"
        "def plot_dist(dat,fit,chr):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "   if chr in fit:\n"
        "       for i in range(len(fit[chr])):\n"
        "           pfit = fit[chr][i]\n"
        "           exec('def xfit(x): return '+pfit[5])\n"
        "           istart = int(pfit[3])\n"
        "           iend   = int(pfit[4])+1\n"
        "           vals   = dat[chr][istart:iend]\n"
        "           args   = {}\n"
        "           if i==0: args = {'label':'Target to Fit'}\n"
        "           ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "           if i==0: args = {'label':'Best Fit'}\n"
        "           ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "   ax.set_title('BAF distribution, chr'+chr)\n"
        "   ax.set_xlabel('BAF')\n"
        "   ax.set_ylabel('Frequency')\n"
        "   ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "   plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "   plt.close()\n"
        "def plot_copy_number(cn):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   xlabels = sorted(cn.keys())\n"
        "   xvals = range(len(xlabels))\n"
        "   yvals = [float(cn[x]) for x in xlabels]\n"
        "   ax.plot(xvals,yvals,'o',color='red')\n"
        "   for i in range(len(xvals)):\n"
        "       if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "   ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "   ax.set_xticks(xvals)\n"
        "   ax.set_xticklabels(xlabels,rotation=45)\n"
        "   ax.set_xlim(-1,len(xlabels))\n"
        "   ax.set_ylim(0,5.0)\n"
        "   ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "   ax.set_xlabel('Chromosome')\n"
        "   ax.set_ylabel('Copy Number')\n"
        "   plt.savefig(outdir+'/copy-number.png')\n"
        "   plt.close()\n"
        "class myParser(argparse.ArgumentParser):\n"
        "   def error(self, message):\n"
        "       self.print_help()\n"
        "       sys.stderr.write('error: %%s\\n' %% message)\n"
        "       sys.exit(2)\n"
        "def main():\n"
        "   parser = myParser()\n"
        "   parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "   parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "   parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "   args = parser.parse_args()\n"
        "   dat = {}; fit = {}; cn = {}\n"
        "   read_dat(dat,fit,cn)\n"
        "   if args.distrib!=None:\n"
        "       plot_dist(dat,fit,args.distrib)\n"
        "   if args.all:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "       plot_copy_number(cn)\n"
        "   elif args.copy_number:\n"
        "       plot_copy_number(cn)\n"
        "   else:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "if __name__ == '__main__':\n"
        "   main()\n",
/*
 * Produces the concatenated output from the input files in one of three ways:
 *   - args->phased_concat set: opens readers on demand, advances them in sync
 *     and hands the records of the first (and, if open, second) reader to
 *     phased_push();
 *   - otherwise, args->files set: reads all files through the synced reader
 *     and writes each available record after translating it to the output
 *     header (only the first available one per site when args->remove_dups);
 *   - otherwise: opens the files one after another and copies their records,
 *     raising an error when a chromosome block is not contiguous or when
 *     positions go backwards within a block.
 */
static void concat(args_t *args)
    int i;
    if ( args->phased_concat )  // phased concat
        // keep only two open files at a time
        while ( args->ifname < args->nfnames )
            int new_file = 0;
            while ( args->files->nreaders < 2 && args->ifname < args->nfnames )
                if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                new_file = 1;

                if ( args->start_pos[args->ifname-1]==-1 ) break;   // new chromosome, start with only one file open
                if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome

            // is there a line from the previous run? Seek the newly opened reader to that position
            int seek_pos = -1;
            int seek_chr = -1;
            if ( bcf_sr_has_line(args->files,0) )
                bcf1_t *line = bcf_sr_get_line(args->files,0);
                bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos);
                // Remember where we sought to, so records starting before it can be skipped below
                seek_pos = line->pos;
                seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line));
            else if ( new_file )
                bcf_sr_seek(args->files,NULL,0);  // set to start

            int nret;
            while ( (nret = bcf_sr_next_line(args->files)) )
                if ( !bcf_sr_has_line(args->files,0) )  // no input from the first reader
                    // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                    if ( ! bcf_sr_region_done(args->files,0) ) continue;

                    // The first reader has finished its region: drop it
                    bcf_sr_remove_reader(args->files, 0);

                // Get a line to learn about current position
                for (i=0; i<args->files->nreaders; i++)
                    if ( bcf_sr_has_line(args->files,i) ) break;
                bcf1_t *line = bcf_sr_get_line(args->files,i);

                // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to.
                if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue;
                seek_pos = seek_chr = -1;

                //  Check if the position overlaps with the next, yet unopened, reader
                int must_seek = 0;
                while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] )
                    must_seek = 1;
                    if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum));
                if ( must_seek )
                    // Reposition the readers at the current record after adding new ones
                    bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos);
                    seek_pos = line->pos;
                    seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line));

                // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped
                if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue;

                // Second argument is NULL when only one reader is open
                phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL);

            // Close whatever readers are still open before moving on
            if ( args->files->nreaders )
                while ( args->files->nreaders )
                    bcf_sr_remove_reader(args->files, 0);
    else if ( args->files )  // combining overlapping files, using synced reader
        while ( bcf_sr_next_line(args->files) )
            for (i=0; i<args->files->nreaders; i++)
                bcf1_t *line = bcf_sr_get_line(args->files,i);
                if ( !line ) continue;
                bcf_translate(args->out_hdr, args->files->readers[i].header, line);
                bcf_write1(args->out_fh, args->out_hdr, line);
                // With remove_dups, only the first reader holding this site is written
                if ( args->remove_dups ) break;
    else    // concatenating
        kstring_t tmp = {0,0,0};
        int prev_chr_id = -1, prev_pos;
        bcf1_t *line = bcf_init();
        for (i=0; i<args->nfnames; i++)
            htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
            bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);
            if ( !fp->is_bin && args->output_type&FT_VCF )
                line->max_unpack = BCF_UN_STR;
                // if VCF is on both input and output, avoid VCF to BCF conversion
                while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 )
                    char *str = fp->line.s;
                    // Advance to the first tab, i.e. the end of the first column
                    while ( *str && *str!='\t' ) str++;
                    // NOTE(review): nothing in the lines visible here copies the sequence name into tmp before it is read - confirm against the full file
                    tmp.l = 0;
                    int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s);
                    if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]);
                    if ( prev_chr_id!=chr_id )
                        prev_pos = -1;
                        if ( args->seen_seq[chr_id] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s);
                    char *end;
                    // Second column parsed as a decimal number and converted to 0-based
                    int pos = strtol(str+1,&end,10) - 1;
                    if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s);
                    if ( prev_pos > pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s);
                    args->seen_seq[chr_id] = 1;
                    prev_chr_id = chr_id;

                    if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l);
                // BCF conversion is required
                line->max_unpack = 0;
                while ( bcf_read(fp, hdr, line)==0 )
                    bcf_translate(args->out_hdr, hdr, line);

                    // Same contiguity and ordering checks as above, on the parsed record
                    if ( prev_chr_id!=line->rid )
                        prev_pos = -1;
                        if ( args->seen_seq[line->rid] )
                            error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    if ( prev_pos > line->pos )
                        error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line));
                    args->seen_seq[line->rid] = 1;
                    prev_chr_id = line->rid;

                    if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n");
// Reads bi-allelic variants from the VCF/BCF file `fvcf`, restricted to `region`,
// and appends for each accepted variant its chromosome, start, end and a
// per-sample value vector to the genotype_* members. The per-sample value is the
// DS entry when DS values were returned, otherwise the sum of the two GT alleles
// (or bcf_float_missing when either allele is missing).
void union_data::readGenotypesVCF(string fvcf,string region) {
	int n_includedG = 0;
	int n_excludedG_mult = 0;
	int n_excludedG_void = 0;
	int n_excludedG_user = 0;
	int n_includedS = 0;
	// NOTE(review): nothing in the lines visible here appends to mappingS, yet it is read below - confirm against the full file
	vector < int > mappingS;

	//Opening files
	bcf_srs_t * sr =  bcf_sr_init();

    //vrb.bullet("target region [" + regionGenotype.get() + "]");
    //if (bcf_sr_set_regions(sr, regionGenotype.get().c_str(), 0) == -1) vrb.error("Cannot jump to region!");
	// The return value of bcf_sr_set_regions is not checked here
	bcf_sr_set_regions(sr, region.c_str(), 0);
	if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
		// Cases have no break; presumably vrb.error() does not return - TODO confirm
		switch (sr->errnum) {
		case not_bgzf: vrb.error("File not compressed with bgzip!");
		case idx_load_failed: vrb.error("Impossible to load index file!");
		case file_type_error: vrb.error("File format not detected by htslib!");
		default : vrb.error("Unknown error!");

	//Sample processing
	int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
	for (int i0 = 0 ; i0 < n_samples ; i0 ++) {
		// A non-negative mapping entry counts as an included sample
		if (mappingS.back() >= 0) n_includedS++;

	//Read genotype data
	int ngt, ngt_arr = 0, nds, nds_arr = 0, * gt_arr = NULL, nsl, nsl_arr = 0, * sl_arr = NULL;
	float * ds_arr = NULL;
	bcf1_t * line;
    unsigned int linecount = 0;
	while(bcf_sr_next_line (sr)) {
        linecount ++;
        if (linecount % 100000 == 0) vrb.bullet("Read " + stb.str(linecount) + " lines");
		line =  bcf_sr_get_line(sr, 0);
		// Only variants with exactly two alleles are considered
		if (line->n_allele == 2) {
			ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
			nds = bcf_get_format_float(sr->readers[0].header, line,"DS", &ds_arr, &nds_arr);
			// Usable when there is one DS value per sample or two GT values per sample
			if (nds == n_samples || ngt == 2*n_samples) {
				bcf_unpack(line, BCF_UN_STR);
				string sid = string(line->d.id);
				if (filter_genotype.check(sid)) {
					genotype_chr.push_back(string(bcf_hdr_id2name(sr->readers[0].header, line->rid)));
					string genotype_ref = string(line->d.allele[0]);
					// Start is stored 1-based
					genotype_start.push_back(line->pos + 1);
					nsl = bcf_get_info_int32(sr->readers[0].header, line, "END", &sl_arr, &nsl_arr);
					// Use INFO/END when this test passes, otherwise start + reference allele length - 1
					if (nsl >= 0 && nsl_arr == 1) genotype_end.push_back(sl_arr[0]);
					else genotype_end.push_back(genotype_start.back() + genotype_ref.size() - 1);
					genotype_val.push_back(vector < float > (sample_count, 0.0));

					for(int i = 0 ; i < n_samples ; i ++) {
						if (mappingS[i] >= 0) {
							// DS takes precedence over GT whenever any DS values were returned
							if (nds > 0) genotype_val.back()[mappingS[i]] = ds_arr[i];
							else {
								if (gt_arr[2*i+0] == bcf_gt_missing || gt_arr[2*i+1] == bcf_gt_missing) genotype_val.back()[mappingS[i]] = bcf_float_missing;
								else genotype_val.back()[mappingS[i]] = bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
                    pair < string, int > temp (sid,n_includedG);
				} else n_excludedG_user ++;
			} else n_excludedG_void ++;
		} else n_excludedG_mult ++;

	// NOTE(review): n_includedG is never incremented in the lines visible here - confirm against the full file
	genotype_count = n_includedG;
	//vrb.bullet(stb.str(n_includedG) + " variants included");
	//if (n_excludedG_user > 0) vrb.bullet(stb.str(n_excludedG_user) + " variants excluded by user");
	//if (n_excludedG_mult > 0) vrb.bullet(stb.str(n_excludedG_mult) + " multi-allelic variants excluded");
	//if (n_excludedG_void > 0) vrb.bullet(stb.str(n_excludedG_void) + " uninformative variants excluded [no GT/DS]");
    //if (genotype_count == 0) vrb.leave("Cannot find genotypes in target region!");
/*
 * Command-line driver for running a plugin: parses the options, determines
 * the input file (or standard input), loads the plugin named by the first
 * non-option argument, applies the optional regions/targets restrictions,
 * and then passes every record that survives the optional filter through
 * the plugin's process() callback, writing whatever record it returns.
 * Returns 0 on completion, or the result of list_plugins() when -l is given.
 */
int main_plugin(int argc, char *argv[])
    int c;
    args_t *args  = (args_t*) calloc(1,sizeof(args_t));
    args->argc    = argc; args->argv = argv;
    args->files   = bcf_sr_init();
    // Defaults: output file name "-" and VCF output type
    args->output_fname = "-";
    args->output_type = FT_VCF;
    args->nplugin_paths = -1;
    int regions_is_file = 0, targets_is_file = 0, plist_only = 0;

    if ( argc==1 ) usage(args);
    char *plugin_name = NULL;
    // A first argument that does not start with '-' is taken as the plugin name and shifted out of argv
    if ( argv[1][0]!='-' ) { plugin_name = argv[1]; argc--; argv++; }

    static struct option loptions[] =
    // NOTE(review): the short-option string has no 't'/'T' although the switch handles both;
    // whether loptions maps long options to these values is not visible here
    while ((c = getopt_long(argc, argv, "h?o:O:r:R:li:e:v",loptions,NULL)) >= 0)
        switch (c) {
            case 'v': args->verbose = 1; break;
            case 'o': args->output_fname = optarg; break;
            case 'O':
                // Only the first character of the argument selects the output type
                switch (optarg[0]) {
                    case 'b': args->output_type = FT_BCF_GZ; break;
                    case 'u': args->output_type = FT_BCF; break;
                    case 'z': args->output_type = FT_VCF_GZ; break;
                    case 'v': args->output_type = FT_VCF; break;
                    default: error("The output type \"%s\" not recognised\n", optarg);
            // -e and -i share filter_str; the logic flags are OR-ed together
            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
            case 'r': args->regions_list = optarg; break;
            case 'R': args->regions_list = optarg; regions_is_file = 1; break;
            case 't': args->targets_list = optarg; break;
            case 'T': args->targets_list = optarg; targets_is_file = 1; break;
            case 'l': plist_only = 1; break;
            case '?':
            // Help: load the plugin and print its own usage text
            case 'h': load_plugin(args, plugin_name, 1, &args->plugin); fprintf(stderr,"%s",args->plugin.usage()); return 0; break;
            default: error("Unknown argument: %s\n", optarg);
    if ( plist_only )  return list_plugins(args);

    char *fname = NULL;
    // No positional argument left, or the next argument looks like an option
    if ( optind>=argc || argv[optind][0]=='-' )
        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
        else usage(args);
        args->plugin.argc = argc - optind + 1;
        args->plugin.argv = argv + optind - 1;
        fname = argv[optind];
        args->plugin.argc = argc - optind;
        args->plugin.argv = argv + optind;
    // Presumably reset so the plugin can run its own getopt pass - TODO confirm
    optind = 0;
    // The plugin sees its own name in argv[0]
    args->plugin.argv[0] = plugin_name;
    load_plugin(args, plugin_name, 1, &args->plugin);

    if ( args->regions_list )
        if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    if ( args->targets_list )
        if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
        args->files->collapse |= COLLAPSE_SOME;
    if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open or the file not indexed: %s\n", fname);

    while ( bcf_sr_next_line(args->files) )
        bcf1_t *line = bcf_sr_get_line(args->files,0);
        if ( args->filter )
            int pass = filter_test(args->filter, line, NULL);
            // An exclude-type filter inverts the test result
            if ( args->filter_logic & FLT_EXCLUDE ) pass = pass ? 0 : 1;
            if ( !pass ) continue;
        // The plugin may return NULL, in which case nothing is written for this record
        line = args->plugin.process(line);
        if ( line ) bcf_write1(args->out_fh, args->hdr_out, line);
    return 0;
/*
 * Entry point: parses the command line, loads the rules (from a file via
 * regidx_init() or from a string via init_rules()), opens the input and the
 * output, resolves the trio sample names given with -t or -T to header
 * sample indexes, then passes every record through process() and writes the
 * records it returns. Returns 0 on completion, -1 when args.rules is NULL.
 */
int run(int argc, char **argv)
    char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL;
    args.mode = 0;
    args.output_fname = "-";

    static struct option loptions[] =
    int c;
    while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0)
        switch (c) 
            case 'o': args.output_fname = optarg; break;
            case 'O':
                      // Only the first character of the argument selects the output type
                      switch (optarg[0]) {
                          case 'b': args.output_type = FT_BCF_GZ; break;
                          case 'u': args.output_type = FT_BCF; break;
                          case 'z': args.output_type = FT_VCF_GZ; break;
                          case 'v': args.output_type = FT_VCF; break;
                          default: error("The output type \"%s\" not recognised\n", optarg);
            case 'R': rules_fname = optarg; break;
            case 'r': rules_string = optarg; break;
            case 'd': args.mode |= MODE_DELETE; break;
            case 'c': args.mode |= MODE_COUNT; break;
            // NOTE(review): no break is visible after this case, so it would fall through to 't' - confirm against the full file
            case 'l': 
                if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; 
                else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; 
                else error("The argument not recognised: --list %s\n", optarg);
            case 't': trio_samples = optarg; break;
            case 'T': trio_file = optarg; break;
            case 'h':
            case '?':
            default: error("%s",usage()); break;
    if ( rules_fname )
        args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args);
        args.rules = init_rules(&args, rules_string);
    if ( !args.rules ) return -1;
    // Two independent iterators over the same rules index
    args.itr     = regitr_init(args.rules);
    args.itr_ori = regitr_init(args.rules);

    char *fname = NULL;
    // No positional argument left, or the next argument looks like an option
    if ( optind>=argc || argv[optind][0]=='-' )
        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
        else error("%s",usage());
        fname = argv[optind];

    if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n");
    if ( !args.mode ) error("Expected one of the -c, -d or -l options\n");
    // Delete mode without an explicit list selection applies to both good and bad
    if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD;

    args.sr = bcf_sr_init();
    if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum));
    args.hdr = bcf_sr_get_header(args.sr, 0);
    args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type));
    if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno));
    // The input header is written unchanged to the output
    bcf_hdr_write(args.out_fh, args.hdr);

    int i, n = 0;
    char **list;
    if ( trio_samples )
        // -t: a single trio given as three sample names, in the order mother, father, child
        args.ntrios = 1;
        args.trios = (trio_t*) calloc(1,sizeof(trio_t));
        list = hts_readlist(trio_samples, 0, &n);
        if ( n!=3 ) error("Expected three sample names with -t\n");
        // NOTE(review): unlike the -T branch, the indexes returned here are not checked for <0
        args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        args.trios[0].ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
        // NOTE(review): the strings are freed but no free(list) is visible here - confirm against the full file
        for (i=0; i<n; i++) free(list[i]);
    if ( trio_file )
        // -T: one trio per list entry, written as mother,father,child
        list = hts_readlist(trio_file, 1, &n);
        args.ntrios = n;
        args.trios = (trio_t*) calloc(n,sizeof(trio_t));
        for (i=0; i<n; i++)
            char *ss = list[i], *se;
            se = strchr(ss, ',');
            if ( !se ) error("Could not parse %s: %s\n",trio_file, ss);
            // Terminate the first field in place at the comma
            *se = 0;
            args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss);
            ss = ++se; 
            se = strchr(ss, ',');
            if ( !se ) error("Could not parse %s\n",trio_file);
            *se = 0;
            args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss);
            ss = ++se; 
            // The remainder of the entry is the child's name and must not be empty
            if ( *ss=='\0' ) error("Could not parse %s\n",trio_file);
            args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss);

    while ( bcf_sr_next_line(args.sr) )
        bcf1_t *line = bcf_sr_get_line(args.sr,0);
        // process() may return NULL, in which case nothing is written for this record
        line = process(line);
        if ( line )
            if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode);
            bcf_write1(args.out_fh, args.hdr, line);

    // Per-trio summary header goes to stderr
    fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n");
    for (i=0; i<args.ntrios; i++)
        trio_t *trio = &args.trios[i];
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild)
    if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n");
    return 0;
// Read the variants of `fvcf` through htslib's synced reader (bcf_sr_*) and
// store the haplotypes of those that pass every filter visible below:
//   - exactly two alleles                  (else counted in n_excludedV_mult)
//   - chromosome known to findCHR()        (else counted in n_excludedV_uchr)
//   - two genotype values per sample       (else counted in n_excludedV_void)
//   - minor allele frequency >= threshold_maf (else counted in n_excludedV_rare)
//   - getDistance(chr_idx, pos) < 1e6      (else counted in n_excludedV_toofar)
// For each kept variant a "<chr>_<pos>" key is inserted into genotype_uuid and a
// row of 2 * included_sample booleans is appended to genotype_haps.
// Afterwards genotype_qtl / genotype_gwas / genotype_bin are re-created with one
// default entry per element of genotype_pos, and a summary is printed via vrb.
void genrich_data::readReferenceGenotypes(string fvcf) {
	vector < int > mappingS;

	//Opening files
	vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf));
	bcf_srs_t * sr =  bcf_sr_init();
	// A zero return from bcf_sr_add_reader is treated as failure; sr->errnum picks the message.
	if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
		switch (sr->errnum) {
		// NOTE(review): the cases carry no break -- presumably vrb.error() does not return; confirm.
		case not_bgzf: vrb.error("File not compressed with bgzip!");
		case idx_load_failed: vrb.error("Impossible to load index file!");
		case file_type_error: vrb.error("File format not detected by htslib!");
		default : vrb.error("Unknown error!");

	//Sample processing
	int included_sample = 0;
	int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
	for (int i = 0 ; i < n_samples ; i ++) {
		// Counts the samples whose mappingS entry is non-negative.
		// NOTE(review): no statement filling mappingS is visible in this loop -- confirm against the full file.
		if (mappingS.back() >= 0) included_sample ++;
	vrb.bullet("#samples = " + stb.str(included_sample));

	//Variant processing
	unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0;
	// gt_arr / ngt_arr form the buffer that bcf_get_genotypes fills (and may grow) on each call.
	int ngt, ngt_arr = 0, *gt_arr = NULL;
	bcf1_t * line;
	while(bcf_sr_next_line (sr)) {
		line =  bcf_sr_get_line(sr, 0);
		// Only sites with exactly two alleles are considered.
		if (line->n_allele == 2) {
			bcf_unpack(line, BCF_UN_STR);
			string sid = string(line->d.id);
			string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
			int chr_idx = findCHR(chr);
			if (chr_idx >= 0) {
				// line->pos is shifted by one here (htslib stores 0-based positions).
				unsigned int pos = line->pos + 1;
				ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
				// Require exactly two genotype values per sample.
				if (ngt == 2*n_samples) {
					double freq = 0.0, tot = 0.0;
					for(int i = 0 ; i < n_samples ; i ++) {
						// Missing alleles are not tolerated for any sample, included or not.
						assert(gt_arr[2*i+0] != bcf_gt_missing && gt_arr[2*i+1] != bcf_gt_missing);
						if (mappingS[i] >= 0) {
							// Sum of allele indices over both values; tot counts the alleles examined.
							freq += bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
							tot += 2.0;
					// NOTE(review): tot stays 0 when no sample has mappingS[i] >= 0, making this 0/0 -- confirm that cannot happen.
					double maf = freq / tot;
					// Fold the frequency so that it refers to the rarer allele.
					if (maf > 0.5) maf = 1.0 - maf;
					if (maf >= threshold_maf) {
						int dist_tss = getDistance(chr_idx, pos);
						if (dist_tss < 1e6) {
							string tmp_id = chr + "_" + stb.str(pos);
							// The key maps to the current size of genotype_pos, used here as the variant's index.
							genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size()));
							// One boolean per haplotype: two slots for every included sample.
							genotype_haps.push_back(vector < bool > (2 * included_sample, false));
							for(int i = 0 ; i < n_samples ; i ++) {
								if (mappingS[i] >= 0) {
									// mappingS[i] gives the output slot of input sample i.
									genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(gt_arr[2 * i + 0]);
									genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]);
						} else n_excludedV_toofar++;
					} else n_excludedV_rare ++;
				} else n_excludedV_void ++;
			} else n_excludedV_uchr ++;
		} else n_excludedV_mult ++;

		// Progress message every 100000 lines (including line 0).
		if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line));

		n_line ++;
	// Re-create the per-variant vectors with one default entry per element of genotype_pos.
	genotype_qtl = vector < bool > (genotype_pos.size(), false);
	genotype_gwas = vector < bool > (genotype_pos.size(), false);
	genotype_bin = vector < int > (genotype_pos.size(), -1);

	// Summary of what was kept and why variants were dropped.
	// NOTE(review): n_excludedV_void is counted above but has no report line here.
	vrb.bullet(stb.str(genotype_pos.size()) + " variants included");
	if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded");
	if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss");
	if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants");
	if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants");
// Compare every pair of samples in args->files and report the results on stdout.
// Per-pair counters (ndif = differing sites, ntot = compared sites) are filled
// while streaming the records; the function then prints, in this order:
//   ERR      pairwise error rate and number of sites compared
//   CLUSTER  sample clusters produced by hclust_*
//   DBG      explanation lines from hclust_explain
//   TH       the clustering threshold
//   DOT      the cluster graph
//   CN       deprecated block kept for backward compatibility
// Exits through error() when neither PL nor GT is defined in the header.
static void cross_check_gts(args_t *args)
    // Initialize things: check which tags are defined in the header, sample names etc.
    if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "PL")<0 )
        if ( bcf_hdr_id2int(args->sm_hdr, BCF_DT_ID, "GT")<0 )
            error("[E::%s] Neither PL nor GT present in the header of %s\n", __func__, args->files->readers[0].fname);
        // PL is unavailable: fall back to GT, warning only when the user had not already asked for GT.
        if ( !args->no_PLs ) {
            fprintf(stderr,"Warning: PL not present in the header of %s, using GT instead\n", args->files->readers[0].fname);
            args->no_PLs = 99;

    args->nsmpl = bcf_hdr_nsamples(args->sm_hdr);
    // Number of unordered sample pairs, n*(n-1)/2.
    args->narr  = (args->nsmpl-1)*args->nsmpl/2;

    // One zero-initialized counter per sample pair; the element size 4 is the byte size of uint32_t.
    uint32_t *ndif = (uint32_t*) calloc(args->narr,4);
    uint32_t *ntot = (uint32_t*) calloc(args->narr,4);

    while ( bcf_sr_next_line(args->files) )
        bcf1_t *line = bcf_sr_get_line(args->files,0);

        // use PLs unless no_PLs is set and GT exists
        if ( args->no_PLs )
            // A zero return from process_GT moves on to the next record.
            if ( process_GT(args,line,ntot,ndif)==0 ) continue;
    FILE *fp = stdout;
    print_header(args, fp);

    // Pairwise distance buffer handed to hclust_init; sized for one float per sample pair.
    float *tmp = (float*)malloc(sizeof(float)*args->nsmpl*(args->nsmpl-1)/2);

    // Output pairwise distances
    fprintf(fp, "# ERR, error rate\t[2]Pairwise error rate\t[3]Number of sites compared\t[4]Sample i\t[5]Sample j\n");
    int i,j, idx = 0;
    for (i=0; i<args->nsmpl; i++)
        for (j=0; j<i; j++)
            // Pairs with no compared sites get the placeholder value 1e-10 instead of a division by zero.
            float err = ntot[idx] ? (float)ndif[idx]/ntot[idx] : 1e-10;
            // NOTE(review): ntot[idx] is uint32_t but is printed with PRId32 -- confirm intended.
            fprintf(fp, "ERR\t%f\t%"PRId32"\t%s\t%s\n", err, ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);
            // PDIST is defined elsewhere; presumably it addresses the (i,j) slot of tmp.
            // NOTE(review): no statement advancing idx is visible in this loop -- confirm.
            PDIST(tmp,i,j) = err;

    // Cluster samples
    int nlist;
    // Passed by address to hclust_create_list and printed as TH below, so the callee may change it.
    float clust_max_err = args->max_intra_err;
    hclust_t *clust = hclust_init(args->nsmpl,tmp);
    cluster_t *list = hclust_create_list(clust,args->min_inter_err,&clust_max_err,&nlist);
    fprintf(fp, "# CLUSTER\t[2]Maximum inter-cluster ERR\t[3-]List of samples\n");
    for (i=0; i<nlist; i++)
        fprintf(fp,"CLUSTER\t%f", list[i].dist);
        // NOTE(review): the body of this member loop is not visible here.
        for (j=0; j<list[i].nmemb; j++)
    // Debugging output: the cluster graph and data used for deciding
    // nlist is reused: hclust_explain stores the number of returned lines in it.
    char **dbg = hclust_explain(clust,&nlist);
    for (i=0; i<nlist; i++)
        fprintf(fp,"DBG\t%s\n", dbg[i]);
    fprintf(fp, "# TH, clustering threshold\t[2]Value\nTH\t%f\n",clust_max_err);
    fprintf(fp, "# DOT\t[2]Cluster graph, visualize e.g. as \"this-output.txt | grep ^DOT | cut -f2- | dot -Tsvg -o graph.svg\"\n");
    // NOTE(review): ownership of the string returned by hclust_create_dot is not visible here.
    fprintf(fp, "DOT\t%s\n", hclust_create_dot(clust,args->sm_hdr->samples,clust_max_err));

    // Deprecated output for temporary backward compatibility
    fprintf(fp, "# Warning: The CN block is deprecated and will be removed in future releases. Use ERR instead.\n");
    fprintf(fp, "# [1]CN\t[2]Discordance\t[3]Number of sites\t[4]Average minimum depth\t[5]Sample i\t[6]Sample j\n");
    idx = 0;
    for (i=0; i<args->nsmpl; i++)
        for (j=0; j<i; j++)
            // Column [4] (average minimum depth) is always printed as the literal 0.
            // NOTE(review): as in the ERR loop, no statement advancing idx is visible -- confirm.
            fprintf(fp, "CN\t%"PRId32"\t%"PRId32"\t0\t%s\t%s\n", ndif[idx], ntot[idx],args->sm_hdr->samples[i],args->sm_hdr->samples[j]);

// Accumulate per-sample het/hom/missing counts from the records of one region.
//
// The region is the current one of `itr` on sequence `seq`. The background-region
// code below instead takes sequence and coordinates from args->background; the
// `if ( !itr ) free(seq)` statements indicate that this path applies when itr is NULL.
// Counts go to the structure returned by expand_regs() when one was obtained,
// otherwise to args->bg_counts.
//
// Genotypes are classified from GT when args->guess has GUESS_GT set, otherwise
// from PL (GUESS_PL set) or GL values.
//
// Returns kitr: 1 plus the number of immediately following iterator regions with
// identical start and end (1 when no iterator is used).
int process_region_guess(args_t *args, char *seq, regitr_t *itr)
    int kitr = 1;
    // Defaults used when the coordinates are not overridden below.
    uint32_t start = 0, end = INT_MAX;
    reg_stats_t *stats = NULL;

    // set the start and the end position
    if ( itr )
        start = itr->reg[itr->i].start;
        end   = itr->reg[itr->i].end;

        // flush all records with the same coordinates
        while ( itr->i+kitr<itr->n && start==itr->reg[itr->i+kitr].start && end==itr->reg[itr->i+kitr].end ) kitr++;

        // NOTE(review): min, max and ret are not used by any statement visible in this function.
        int min,max,ret = ploidy_query(args->ploidy, seq, start, args->sex2ploidy, &min, &max);
        stats = expand_regs(args, seq,start,end);
        // background region
        int spos, epos;
        // hts_parse_reg stores the parsed coordinates in spos/epos; its return value is used
        // below as the end of the sequence-name prefix of args->background.
        const char *ptr = hts_parse_reg(args->background, &spos, &epos);
        if ( !ptr )
            error("Could not parse the region: %s\n", args->background);
        // Private buffer for the sequence name; it is released by the `if ( !itr ) free(seq)` statements below.
        // NOTE(review): no copy of the name into seq is visible between the malloc and the terminator -- confirm.
        seq = (char*) malloc(ptr - args->background + 1);
        seq[ptr-args->background] = 0;
        start = spos;
        end   = epos;

    if ( bcf_sr_seek(args->sr,seq,start)!=0 ) 
        // sequence not present
        if ( !itr ) free(seq);
        return kitr;

    // Resolve the sequence id before seq is released.
    int ismpl, rid = bcf_hdr_name2id(args->hdr,seq);
    if ( !itr ) free(seq);

    while ( bcf_sr_next_line(args->sr) )
        bcf1_t *rec = bcf_sr_get_line(args->sr,0);
        // Stop at the first record on another sequence or beyond the region end.
        if ( rec->rid!=rid || rec->pos > end ) break;

        if ( args->guess & GUESS_GT )   // use GTs to guess the ploidy
            bcf_fmt_t *fmt = bcf_get_fmt(args->hdr, rec, "GT");
            // Records without a GT field are skipped.
            if ( !fmt ) continue;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                int gt = bcf_gt_type(fmt, ismpl, NULL,NULL);
                // Unknown genotypes count as missing, the two HET types as het, everything else as hom.
                if ( gt==GT_UNKN ) counts->nmiss++;
                else if ( gt==GT_HET_RA || gt==GT_HET_AA ) counts->nhet++;
                else counts->nhom++;
        else    // use PLs to guess the ploidy
            // Sign applied to every value: +1 for PL, -1 for GL, so that the minimum searches
            // below serve both tags -- presumably because larger GL values are the more likely ones; confirm.
            int gl2pl = args->guess & GUESS_PL ? 1 : -1;
            int npl = bcf_get_format_int32(args->hdr,rec,args->guess&GUESS_PL?"PL":"GL",&args->pls,&args->npls);
            // Skip records for which no values were returned.
            if ( npl<=0 ) continue;
            // From here on npl is the number of values per sample.
            npl /= args->nsample;
            for (ismpl=0; ismpl<args->nsample; ismpl++)
                int32_t *ptr = args->pls + ismpl*npl;
                // phom / phet track the smallest (sign-adjusted) value seen at a homozygous / heterozygous slot.
                int phom = INT_MAX, phet = INT_MAX, ial, jal, k = 0;
                // The inner loop handles the pairs with jal < ial (heterozygous); the statements after it
                // handle the ial/ial (homozygous) slot. Scanning stops at a missing or vector-end value.
                // NOTE(review): no increment of k is visible in these loops although k is compared
                // with allele-derived counts below -- confirm against the full file.
                for (ial=0; ial<rec->n_allele; ial++)
                    for (jal=0; jal<ial; jal++)
                        if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end )  break;
                        ptr[k] *= gl2pl;
                        if ( phet > ptr[k] ) phet = ptr[k];
                    if ( ptr[k] == bcf_int32_missing || ptr[k] == bcf_int32_vector_end )  break;
                    ptr[k] *= gl2pl;
                    if ( phom > ptr[k] ) phom = ptr[k];
                count_t *counts = stats ? &stats->counts[ismpl] : &args->bg_counts[ismpl];
                if ( k == rec->n_allele ) counts->nhom++;   // haploid
                // A tie between the best het and hom values, or a value count other than
                // n_allele*(n_allele+1)/2, is counted as missing.
                else if ( phet == phom || k != rec->n_allele*(rec->n_allele+1)/2 ) counts->nmiss++;
                else if ( phet < phom ) counts->nhet++;
                else counts->nhom++;
    return kitr;
 // Return the record that the synced reader m_sr currently holds for reader
 // index `fileNum`, as given by bcf_sr_get_line().
 // Throws std::range_error when fileNum is not below m_nFiles.
 // NOTE(review): presumably the result can be NULL when that reader has no
 // record at the current position -- confirm against the htslib documentation.
 bcf1_t *get_line(size_t fileNum) {
   if (fileNum >= m_nFiles)
     throw std::range_error("fileNum is too large");
   return bcf_sr_get_line(m_sr, fileNum);