Exemplo n.º 1
0
static void init_data(args_t *args)
{
    args->header = args->files->readers[0].header;

    int i, nsamples = 0, *samples = NULL;
    if ( args->sample_list && strcmp("-",args->sample_list) )
    {
        for (i=0; i<args->files->nreaders; i++)
        {
            int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
            if ( ret<0 ) error("Error parsing the sample list\n");
            else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
        }

        if ( args->sample_list[0]!='^' )
        {
            // the sample ordering may be different if not negated
            int n;
            char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
            if ( !smpls ) error("Could not parse %s\n", args->sample_list);
            if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
                error("The number of samples does not match, perhaps some are present multiple times?\n");
            nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
            samples = (int*) malloc(sizeof(int)*nsamples);
            for (i=0; i<n; i++)
            {
                samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
                free(smpls[i]);
            }
            free(smpls);
        }
    }
    args->convert = convert_init(args->header, samples, nsamples, args->format_str);
    if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
    free(samples);

    int max_unpack = convert_max_unpack(args->convert);
    if ( args->filter_str )
    {
        args->filter = filter_init(args->header, args->filter_str);
        max_unpack |= filter_max_unpack(args->filter);
    }
    args->files->max_unpack = max_unpack;
}
Exemplo n.º 2
0
static void init_data(args_t *args)
{
    bcf_srs_t *files = bcf_sr_init();
    if ( args->regions_list )
    {
        if ( bcf_sr_set_regions(files, args->regions_list, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
    }
    if ( args->targets_list )
    {
        if ( bcf_sr_set_targets(files, args->targets_list, args->targets_is_file, 0)<0 )
            error("Failed to read the targets: %s\n", args->targets_list);
    }
    if ( !bcf_sr_add_reader(files, args->fname) ) error("Failed to open %s: %s\n", args->fname,bcf_sr_strerror(files->errnum));
    bcf_hdr_t *hdr = files->readers[0].header;
    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = hdr->samples[0];
    }
    else if ( bcf_hdr_id2int(hdr,BCF_DT_SAMPLE,args->sample)<0 ) error("No such sample: %s\n", args->sample);
    int ret = bcf_hdr_set_samples(hdr, args->sample, 0);
    if ( ret<0 ) error("Error setting the sample: %s\n", args->sample);

    if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,bcf_hdr_id2int(hdr,BCF_DT_ID,"BAF")) )
        error("The tag FORMAT/BAF is not present in the VCF: %s\n", args->fname);

    int i;
    args->xvals = (double*) calloc(args->nbins,sizeof(double));
    for (i=0; i<args->nbins; i++) args->xvals[i] = 1.0*i/(args->nbins-1);

    // collect BAF distributions for all chromosomes
    int idist = -1, nbaf = 0, nprocessed = 0, ntotal = 0, prev_chr = -1;
    float *baf = NULL;
    while ( bcf_sr_next_line(files) )
    {
        ntotal++;

        bcf1_t *line = bcf_sr_get_line(files,0);
        if ( bcf_get_format_float(hdr,line,"BAF",&baf,&nbaf) != 1 ) continue;
        if ( bcf_float_is_missing(baf[0]) ) continue;

        nprocessed++;

        if ( prev_chr==-1 || prev_chr!=line->rid )
        {
            // new chromosome
            idist = args->ndist++;
            args->dist = (dist_t*) realloc(args->dist, sizeof(dist_t)*args->ndist);
            memset(&args->dist[idist],0,sizeof(dist_t));
            args->dist[idist].chr   = strdup(bcf_seqname(hdr,line));
            args->dist[idist].yvals = (double*) calloc(args->nbins,sizeof(double));
            args->dist[idist].xvals = args->xvals;
            args->dist[idist].nvals = args->nbins;
            prev_chr = line->rid;
        }
        int bin = baf[0]*(args->nbins-1);
        args->dist[idist].yvals[bin]++;   // the distribution
    }
    free(baf);
    bcf_sr_destroy(files);

    for (idist=0; idist<args->ndist; idist++)
    {
        #if 0
            int j;
            for (j=0; j<args->nbins; j++)
            {
                double x = args->dist[idist].xvals[j];
                args->dist[idist].yvals[j] = exp(-(x-0.5)*(x-0.5)/1e-3);
            }
        #endif
        init_dist(args, &args->dist[idist],args->verbose);
    }

    args->dat_fp = open_file(&args->dat_fname,"w","%s/dist.dat", args->output_dir);
    fprintf(args->dat_fp, "# This file was produced by: bcftools polysomy(%s+htslib-%s), the command line was:\n", bcftools_version(),hts_version());
    fprintf(args->dat_fp, "# \t bcftools %s ", args->argv[0]);
    for (i=1; i<args->argc; i++)
        fprintf(args->dat_fp, " %s",args->argv[i]);
    fprintf(args->dat_fp,"\n#\n");
    fprintf(args->dat_fp,"# DIST\t[2]Chrom\t[3]BAF\t[4]Normalized Count\n");
    fprintf(args->dat_fp,"# FIT\t[2]Goodness of Fit\t[3]iFrom\t[4]iTo\t[5]The Fitted Function\n");
    fprintf(args->dat_fp,"# CN\t[2]Chrom\t[3]Estimated Copy Number\t[4]Absolute fit deviation\n");

    char *fname = NULL;
    FILE *fp = open_file(&fname,"w","%s/dist.py", args->output_dir);
//-------- matplotlib script --------------
    fprintf(fp,
        "#!/usr/bin/env python\n"
        "#\n"
        "import matplotlib as mpl\n"
        "mpl.use('Agg')\n"
        "import matplotlib.pyplot as plt\n"
        "import csv,sys,argparse\n"
        "from math import exp\n"
        "\n"
        "outdir = '%s'\n"
        "\n"
        "def read_dat(dat,fit,cn):\n"
        "   csv.register_dialect('tab', delimiter='\t', quoting=csv.QUOTE_NONE)\n"
        "   with open(outdir+'/dist.dat', 'rb') as f:\n"
        "      reader = csv.reader(f, 'tab')\n"
        "      for row in reader:\n"
        "          if row[0][0]=='#': continue\n"
        "          type = row[0]\n"
        "          chr  = row[1]\n"
        "          if type=='DIST':\n"
        "              if chr not in dat: dat[chr] = []\n"
        "              dat[chr].append(row)\n"
        "          elif type=='FIT':\n"
        "              if chr not in fit: fit[chr] = []\n"
        "              fit[chr].append(row)\n"
        "          elif type=='CN':\n"
        "              cn[chr] = row[2]\n"
        "\n"
        "def plot_dist(dat,fit,chr):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   ax.plot([x[2] for x in dat[chr]],[x[3] for x in dat[chr]],'k-',label='Distribution')\n"
        "   if chr in fit:\n"
        "       for i in range(len(fit[chr])):\n"
        "           pfit = fit[chr][i]\n"
        "           exec('def xfit(x): return '+pfit[5])\n"
        "           istart = int(pfit[3])\n"
        "           iend   = int(pfit[4])+1\n"
        "           vals   = dat[chr][istart:iend]\n"
        "           args   = {}\n"
        "           if i==0: args = {'label':'Target to Fit'}\n"
        "           ax.plot([x[2] for x in vals],[x[3] for x in vals],'r-',**args)\n"
        "           if i==0: args = {'label':'Best Fit'}\n"
        "           ax.plot([x[2] for x in vals],[xfit(float(x[2])) for x in vals],'g-',**args)\n"
        "   ax.set_title('BAF distribution, chr'+chr)\n"
        "   ax.set_xlabel('BAF')\n"
        "   ax.set_ylabel('Frequency')\n"
        "   ax.legend(loc='best',prop={'size':7},frameon=False)\n"
        "   plt.savefig(outdir+'/dist.chr'+chr+'.png')\n"
        "   plt.close()\n"
        "\n"
        "def plot_copy_number(cn):\n"
        "   fig, ax = plt.subplots(1, 1, figsize=(7,5))\n"
        "   xlabels = sorted(cn.keys())\n"
        "   xvals = range(len(xlabels))\n"
        "   yvals = [float(cn[x]) for x in xlabels]\n"
        "   ax.plot(xvals,yvals,'o',color='red')\n"
        "   for i in range(len(xvals)):\n"
        "       if yvals[i]==-1: ax.annotate('?', xy=(xvals[i],0.5),va='center',ha='center',color='red',fontweight='bold')\n"
        "   ax.tick_params(axis='both', which='major', labelsize=9)\n"
        "   ax.set_xticks(xvals)\n"
        "   ax.set_xticklabels(xlabels,rotation=45)\n"
        "   ax.set_xlim(-1,len(xlabels))\n"
        "   ax.set_ylim(0,5.0)\n"
        "   ax.set_yticks([1.0,2.0,3.0,4.0])\n"
        "   ax.set_xlabel('Chromosome')\n"
        "   ax.set_ylabel('Copy Number')\n"
        "   plt.savefig(outdir+'/copy-number.png')\n"
        "   plt.close()\n"
        "\n"
        "class myParser(argparse.ArgumentParser):\n"
        "   def error(self, message):\n"
        "       self.print_help()\n"
        "       sys.stderr.write('error: %%s\\n' %% message)\n"
        "       sys.exit(2)\n"
        "\n"
        "def main():\n"
        "   parser = myParser()\n"
        "   parser.add_argument('-a', '--all', action='store_true', help='Create all plots')\n"
        "   parser.add_argument('-c', '--copy-number', action='store_true', help='Create copy-number plot')\n"
        "   parser.add_argument('-d', '--distrib', metavar='CHR', help='Plot BAF distribution of a single chromosome')\n"
        "   args = parser.parse_args()\n"
        "   dat = {}; fit = {}; cn = {}\n"
        "   read_dat(dat,fit,cn)\n"
        "   if args.distrib!=None:\n"
        "       plot_dist(dat,fit,args.distrib)\n"
        "   if args.all:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "       plot_copy_number(cn)\n"
        "   elif args.copy_number:\n"
        "       plot_copy_number(cn)\n"
        "   else:\n"
        "       for chr in dat: plot_dist(dat,fit,chr)\n"
        "\n"
        "if __name__ == '__main__':\n"
        "   main()\n",
        args->output_dir);
//---------------------------------------
    chmod(fname, S_IWUSR|S_IRUSR|S_IRGRP|S_IROTH|S_IXUSR|S_IXGRP|S_IXOTH);
    free(fname);
    fclose(fp);
}
Exemplo n.º 3
0
Arquivo: query.c Projeto: CoREse/gqt
//{{{ void print_query_result_offset(uint32_t *mask,
void print_query_result_offset(uint32_t *mask,
                               uint32_t mask_len,
                               uint32_t *vids,
                               struct gqt_query *q,
                               uint32_t **counts,
                               uint32_t *id_lens,
                               uint32_t *U_R,
                               uint32_t U_R_len,
                               char **id_query_list,
                               char **gt_query_list,
                               uint32_t num_qs,
                               uint32_t num_fields,
                               char *off_file_name,
                               char *source_file,
                               char *full_cmd)
{
    struct off_file *off_f = open_off_file(off_file_name);
    struct bcf_file bcf_f = init_bcf_file(source_file);

    char *sample_names = NULL;

    uint32_t i,j,k,line_idx,bytes, bit_i = 0;
    int r;
    for (i = 0; i < U_R_len; ++i) {
        if (i == 0 )
            r = asprintf(&sample_names,
                         "%s",
                         bcf_f.hdr->samples[U_R[i]]);
        else
            r = asprintf(&sample_names,
                         "%s,%s",
                         sample_names,
                         bcf_f.hdr->samples[U_R[i]]);
        if (r == -1)
            err(EX_OSERR, "asprintf error");
    }

    if (bcf_hdr_set_samples(bcf_f.hdr, sample_names, 0) != 0)
        errx(EX_DATAERR, "Error setting samples: %s\n", source_file);

    char *info_s;
 
    for (i = 0; i < num_qs; i++) {
        if ( q[i].variant_op == p_count ) {
            r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Integer,"
                         "Description=\"GQT count result from "
                         "phenotype:'%s' genotype:'%s'\">",
                         i, id_query_list[i], gt_query_list[i]);
            if (r == -1) err(EX_OSERR, "asprintf error");

            if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
                errx(EX_DATAERR, "Error updating header: %s\n", source_file);

        } else if ( q[i].variant_op == p_pct ) {
            r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float,"
                         "Description=\"GQT percent result from "
                         "phenotype:'%s' genotype:'%s'\">",
                         i, id_query_list[i], gt_query_list[i]);
            if (r == -1) err(EX_OSERR, "asprintf error");

            if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
                errx(EX_DATAERR, "Error updating header: %s\n", source_file);

        } else if ( q[i].variant_op == p_maf ) {
            r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float,"
                         "Description=\"GQT maf result from "
                         "phenotype:'%s' genotype:'%s'\">",
                         i, id_query_list[i], gt_query_list[i]);

            if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
                errx(EX_DATAERR, "Error updating header: %s\n", source_file);
        }

    }

    r = asprintf(&info_s, "##%s_queryVersion=%s", PROGRAM_NAME, VERSION);
    if (r == -1) err(EX_OSERR, "asprintf error");

    if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
        errx(EX_DATAERR, "Error updating header: %s\n", source_file);

    r = asprintf(&info_s, "##%s_queryCommand=%s", PROGRAM_NAME, full_cmd);
    if (r == -1) err(EX_OSERR, "asprintf error");

    if (bcf_hdr_append(bcf_f.hdr, info_s) != 0)
        errx(EX_DATAERR, "Error updating header: %s\n", source_file);

    htsFile *out_f = hts_open("-","w");
    if ( !out_f )
        err(EX_DATAERR, "Could open output file");

    bcf_hdr_write(out_f, bcf_f.hdr);

    bcf_f.line = bcf_init1();

    for (i=0; i < mask_len; ++i) {
        bytes = mask[i];
	if (bytes == 0)
            continue; /* skip a bunch of ops if you can */
        for (j=0; j < 32; j++) {
            if (bytes & 1 << (31 - j)) {
	        line_idx = i*32+j;

                r = goto_bcf_line(&bcf_f, off_f, line_idx);

                if (r == -1) 
                    err(EX_NOINPUT,
                        "Error seeking file '%s'", bcf_f.file_name);

                r = get_bcf_line(&bcf_f);
                if (r == -1) 
                    err(EX_NOINPUT,
                        "Error reading file '%s'", bcf_f.file_name);

                for (k=0; k < num_qs; k++) {
                    r = asprintf(&info_s, "GQT_%u", k);
                    if (r == -1)
                        err(EX_OSERR, "asprintf error");

                    if ( q[k].variant_op == p_count ) {
                        int32_t v = counts[k][line_idx];
                        if (bcf_update_info_int32(bcf_f.hdr,
                                                  bcf_f.line,
                                                  info_s,
                                                  &v,
                                                  1) != 0)
                            errx(EX_DATAERR,
                                 "Error adding to info field: %s\n",
                                 bcf_f.file_name);
                    } else if (q[k].variant_op == p_pct) {
                        float v = ((float)counts[k][line_idx])/
                                    ((float) id_lens[k]);
                        if (bcf_update_info_float(bcf_f.hdr,
                                                  bcf_f.line,
                                                  info_s,
                                                  &v,
                                                  1) != 0)
                            errx(EX_DATAERR,
                                 "Error adding to info field: %s\n",
                                 bcf_f.file_name);

                    } else if (q[k].variant_op == p_maf) {
                        float v = ((float)counts[k][line_idx])/
                                    (((float) id_lens[k])*2.0);
                        if (bcf_update_info_float(bcf_f.hdr,
                                                  bcf_f.line,
                                                  info_s,
                                                  &v,
                                                  1) != 0)
                            errx(EX_DATAERR,
                                 "Error adding to info field: %s\n",
                                 bcf_f.file_name);
                    }
                }

                bcf_write(out_f, bcf_f.hdr, bcf_f.line);

            }
	    bit_i++;
	    if (bit_i == num_fields)
	        break;
        }

        if (bit_i == num_fields)
            break;
    }
    hts_close(out_f);
    destroy_off_file(off_f);
}
Exemplo n.º 4
0
static void init_data(args_t *args)
{
    args->prev_rid = args->skip_rid = -1;
    args->hdr = args->files->readers[0].header;

    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = strdup(args->hdr->samples[0]);
    }
    if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");

    // Set samples
    kstring_t str = {0,0,0};
    if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
    {
        int i, n;
        char **smpls = hts_readlist(args->estimate_AF, 1, &n);

        // Make sure the query sample is included
        for (i=0; i<n; i++)
            if ( !strcmp(args->sample,smpls[i]) ) break;

        // Add the query sample if not present
        if ( i!=n ) kputs(args->sample, &str);

        for (i=0; i<n; i++)
        {
            if ( str.l ) kputc(',', &str);
            kputs(smpls[i], &str);
            free(smpls[i]);
        }
        free(smpls);
    }
    else if ( !args->estimate_AF )
        kputs(args->sample, &str);

    if ( str.l )
    {
        int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
        if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
        else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
    }

    if ( args->af_tag )
        if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
            error("No such INFO tag in the VCF: %s\n", args->af_tag);

    args->nsmpl = bcf_hdr_nsamples(args->hdr);
    args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
    free(str.s);

    int i;
    for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);

    // Init transition matrix and HMM
    double tprob[4];
    MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
    MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
    MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
    MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; 

    if ( args->genmap_fname ) 
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
    }
    else if ( args->rec_rate > 0 )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);

    }
    else
        args->hmm = hmm_init(2, tprob, 10000);

    // print header
    printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
    printf("# The command line was:\tbcftools %s", args->argv[0]);
    for (i=1; i<args->argc; i++)
        printf(" %s",args->argv[i]);
    printf("\n#\n");
    printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
}
Exemplo n.º 5
0
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    char *trio_samples = NULL, *unrelated_samples = NULL;
    memset(&args,0,sizeof(args_t));
    args.prev_rid = -1;
    args.hdr = in;
    args.pij = 2e-8;
    args.pgt_err = 1e-9;

    static struct option loptions[] =
    {
        {"prefix",1,0,'p'},
        {"trio",1,0,'t'},
        {"unrelated",1,0,'u'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:u:p:",loptions,NULL)) >= 0)
    {
        switch (c) 
        {
            case 'p': args.prefix = optarg; break;
            case 't': trio_samples = optarg; break;
            case 'u': unrelated_samples = optarg; break;
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    if ( optind != argc ) error(usage());
    if ( trio_samples && unrelated_samples ) error("Expected only one of the -t/-u options\n");
    if ( !trio_samples && !unrelated_samples ) error("Expected one of the -t/-u options\n");
    if ( !args.prefix ) error("Expected the -p option\n");

    int ret = bcf_hdr_set_samples(args.hdr, trio_samples ? trio_samples : unrelated_samples, 0);
    if ( ret<0 ) error("Could not parse samples: %s\n", trio_samples ? trio_samples : unrelated_samples);
    else if ( ret>0 ) error("%d-th sample not found: %s\n", ret,trio_samples ? trio_samples : unrelated_samples);

    if ( trio_samples )
    {
        int i,n = 0;
        char **list = hts_readlist(trio_samples, 0, &n);
        if ( n!=3 ) error("Expected three sample names with -t\n");
        args.imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        args.ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_trio;
        args.mode = C_TRIO;
        init_hmm_trio(&args);
    }
    else
    {
        int i,n = 0;
        char **list = hts_readlist(unrelated_samples, 0, &n);
        if ( n!=2 ) error("Expected two sample names with -u\n");
        args.isample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.jsample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_unrelated;
        args.mode = C_UNRL;
        init_hmm_unrelated(&args);
    }
    return 1;
}