Example #1
0
int parse_format_flag(const char *str)
{
    int i, flag = 0, n_tags;
    char **tags = hts_readlist(str, 0, &n_tags);
    for(i=0; i<n_tags; i++)
    {
        if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP;
        else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); }
        else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP;
        else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); }
        else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); }
        else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. Please switch to `INFO/AD` in future.\n"); }
        else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD;
        else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF;
        else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR;
        else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD;
        else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF;
        else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR;
        else
        {
            fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
            exit(EXIT_FAILURE);
        }
        free(tags[i]);
    }
    if (n_tags) free(tags);
    return flag;
}
Example #2
0
int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file)
{
    if ( list[0]!='^' ) bsmpl->sample_logic = 1;
    else list++;

    int i, nsamples = 0;
    char **samples = hts_readlist(list, is_file, &nsamples);
    if ( !nsamples ) return 0;

    kstring_t ori = {0,0,0};
    kstring_t ren = {0,0,0};

    bsmpl->sample_list = khash_str2str_init();
    for (i=0; i<nsamples; i++)
    {
        char *ptr = samples[i];
        ori.l = ren.l = 0;
        int escaped = 0;
        while ( *ptr )
        {
            if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
            if ( isspace(*ptr) && !escaped ) break;
            kputc(*ptr, &ori);
            escaped = 0;
            ptr++;
        }
        if ( *ptr )
        {
            while ( *ptr && isspace(*ptr) ) ptr++;
            while ( *ptr )
            {
                if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
                if ( isspace(*ptr) && !escaped ) break;
                kputc(*ptr, &ren);
                escaped = 0;
                ptr++;
            }
        }
        khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s));
        free(samples[i]);
    }
    free(samples);
    free(ori.s);
    free(ren.s);
    return nsamples;
}
Example #3
0
static void init_data(args_t *args)
{
    args->header = args->files->readers[0].header;

    int i, nsamples = 0, *samples = NULL;
    if ( args->sample_list && strcmp("-",args->sample_list) )
    {
        for (i=0; i<args->files->nreaders; i++)
        {
            int ret = bcf_hdr_set_samples(args->files->readers[i].header,args->sample_list,args->sample_is_file);
            if ( ret<0 ) error("Error parsing the sample list\n");
            else if ( ret>0 ) error("Sample name mismatch: sample #%d not found in the header\n", ret);
        }

        if ( args->sample_list[0]!='^' )
        {
            // the sample ordering may be different if not negated
            int n;
            char **smpls = hts_readlist(args->sample_list, args->sample_is_file, &n);
            if ( !smpls ) error("Could not parse %s\n", args->sample_list);
            if ( n!=bcf_hdr_nsamples(args->files->readers[0].header) )
                error("The number of samples does not match, perhaps some are present multiple times?\n");
            nsamples = bcf_hdr_nsamples(args->files->readers[0].header);
            samples = (int*) malloc(sizeof(int)*nsamples);
            for (i=0; i<n; i++)
            {
                samples[i] = bcf_hdr_id2int(args->files->readers[0].header, BCF_DT_SAMPLE,smpls[i]);
                free(smpls[i]);
            }
            free(smpls);
        }
    }
    args->convert = convert_init(args->header, samples, nsamples, args->format_str);
    if ( args->allow_undef_tags ) convert_set_option(args->convert, allow_undef_tags, 1);
    free(samples);

    int max_unpack = convert_max_unpack(args->convert);
    if ( args->filter_str )
    {
        args->filter = filter_init(args->header, args->filter_str);
        max_unpack |= filter_max_unpack(args->filter);
    }
    args->files->max_unpack = max_unpack;
}
Example #4
0
int parse_format_flag(const char *str)
{
    int i, flag = 0, n_tags;
    char **tags = hts_readlist(str, 0, &n_tags);
    for(i=0; i<n_tags; i++)
    {
        if ( !strcasecmp(tags[i],"DP") ) flag |= B2B_FMT_DP;
        else if ( !strcasecmp(tags[i],"DV") ) flag |= B2B_FMT_DV;
        else if ( !strcasecmp(tags[i],"SP") ) flag |= B2B_FMT_SP;
        else if ( !strcasecmp(tags[i],"DP4") ) flag |= B2B_FMT_DP4;
        else if ( !strcasecmp(tags[i],"DPR") ) flag |= B2B_FMT_DPR;
        else if ( !strcasecmp(tags[i],"INFO/DPR") ) flag |= B2B_INFO_DPR;
        else
        {
            fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
            exit(1);
        }
        free(tags[i]);
    }
    if (n_tags) free(tags);
    return flag;
}
Example #5
0
int parse_tags(args_t *args, const char *str)
{
    int i, flag = 0, n_tags;
    char **tags = hts_readlist(str, 0, &n_tags);
    for(i=0; i<n_tags; i++)
    {
        if ( !strcasecmp(tags[i],"AN") ) flag |= SET_AN;
        else if ( !strcasecmp(tags[i],"AC") ) flag |= SET_AC;
        else if ( !strcasecmp(tags[i],"NS") ) flag |= SET_NS;
        else if ( !strcasecmp(tags[i],"AC_Hom") ) flag |= SET_AC_Hom;
        else if ( !strcasecmp(tags[i],"AC_Het") ) flag |= SET_AC_Het;
        else if ( !strcasecmp(tags[i],"AC_Hemi") ) flag |= SET_AC_Hemi;
        else if ( !strcasecmp(tags[i],"AF") ) flag |= SET_AF;
        else
        {
            fprintf(stderr,"Error parsing \"--tags %s\": the tag \"%s\" is not supported\n", str,tags[i]);
            exit(1);
        }
        free(tags[i]);
    }
    if (n_tags) free(tags);
    return flag;
}
Example #6
0
static void list_columns(args_t *args)
{
    void *has_sample = NULL;
    if ( args->sample_list )
    {
        has_sample = khash_str2int_init();
        int i, nsmpl;
        char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
        for (i=0; i<nsmpl; i++) khash_str2int_inc(has_sample, smpl[i]);
        free(smpl);
    }

    int i;
    bcf_sr_t *reader = &args->files->readers[0];
    for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
    {
        if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
        printf("%s\n", reader->header->samples[i]);
    }

    if ( has_sample )
        khash_str2int_destroy_free(has_sample);
}
Example #7
0
int run(int argc, char **argv)
{
    char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL;
    memset(&args,0,sizeof(args_t));
    args.mode = 0;
    args.output_fname = "-";

    static struct option loptions[] =
    {
        {"trio",1,0,'t'},
        {"trio-file",1,0,'T'},
        {"delete",0,0,'d'},
        {"list",1,0,'l'},
        {"count",0,0,'c'},
        {"rules",1,0,'r'},
        {"rules-file",1,0,'R'},
        {"output",required_argument,NULL,'o'},
        {"output-type",required_argument,NULL,'O'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0)
    {
        switch (c) 
        {
            case 'o': args.output_fname = optarg; break;
            case 'O':
                      switch (optarg[0]) {
                          case 'b': args.output_type = FT_BCF_GZ; break;
                          case 'u': args.output_type = FT_BCF; break;
                          case 'z': args.output_type = FT_VCF_GZ; break;
                          case 'v': args.output_type = FT_VCF; break;
                          default: error("The output type \"%s\" not recognised\n", optarg);
                      };
                      break;
            case 'R': rules_fname = optarg; break;
            case 'r': rules_string = optarg; break;
            case 'd': args.mode |= MODE_DELETE; break;
            case 'c': args.mode |= MODE_COUNT; break;
            case 'l': 
                if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; 
                else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; 
                else error("The argument not recognised: --list %s\n", optarg);
                break;
            case 't': trio_samples = optarg; break;
            case 'T': trio_file = optarg; break;
            case 'h':
            case '?':
            default: error("%s",usage()); break;
        }
    }
    if ( rules_fname )
        args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args);
    else
        args.rules = init_rules(&args, rules_string);
    if ( !args.rules ) return -1;
    args.itr     = regitr_init(args.rules);
    args.itr_ori = regitr_init(args.rules);

    char *fname = NULL;
    if ( optind>=argc || argv[optind][0]=='-' )
    {
        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";  // reading from stdin
        else error("%s",usage());
    }
    else
        fname = argv[optind];

    if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n");
    if ( !args.mode ) error("Expected one of the -c, -d or -l options\n");
    if ( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD;

    args.sr = bcf_sr_init();
    if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum));
    args.hdr = bcf_sr_get_header(args.sr, 0);
    args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type));
    if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno));
    bcf_hdr_write(args.out_fh, args.hdr);


    int i, n = 0;
    char **list;
    if ( trio_samples )
    {
        args.ntrios = 1;
        args.trios = (trio_t*) calloc(1,sizeof(trio_t));
        list = hts_readlist(trio_samples, 0, &n);
        if ( n!=3 ) error("Expected three sample names with -t\n");
        args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        args.trios[0].ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
    }
    if ( trio_file )
    {
        list = hts_readlist(trio_file, 1, &n);
        args.ntrios = n;
        args.trios = (trio_t*) calloc(n,sizeof(trio_t));
        for (i=0; i<n; i++)
        {
            char *ss = list[i], *se;
            se = strchr(ss, ',');
            if ( !se ) error("Could not parse %s: %s\n",trio_file, ss);
            *se = 0;
            args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss);
            ss = ++se; 
            se = strchr(ss, ',');
            if ( !se ) error("Could not parse %s\n",trio_file);
            *se = 0;
            args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss);
            ss = ++se; 
            if ( *ss=='\0' ) error("Could not parse %s\n",trio_file);
            args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss);
            if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss);
            free(list[i]);
        }
        free(list);
    }

    while ( bcf_sr_next_line(args.sr) )
    {
        bcf1_t *line = bcf_sr_get_line(args.sr,0);
        line = process(line);
        if ( line )
        {
            if ( line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode);
            bcf_write1(args.out_fh, args.hdr, line);
        }
    }


    fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n");
    for (i=0; i<args.ntrios; i++)
    {
        trio_t *trio = &args.trios[i];
        fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", 
            trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather),
            bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild)
            );
    }
    free(args.gt_arr);
    free(args.trios);
    regitr_destroy(args.itr);
    regitr_destroy(args.itr_ori);
    regidx_destroy(args.rules);
    bcf_sr_destroy(args.sr);
    if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n");
    return 0;
}
int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
{
    int i, j, nsmpl, free_smpl = 0;
    char **smpl = NULL;

    void *exclude = (fname[0]=='^') ? khash_str2int_init() : NULL;
    if ( exclude || strcmp("-",fname) ) // "-" stands for all samples
    {
        smpl = hts_readlist(fname, is_file, &nsmpl);
        if ( !smpl )
        {
            fprintf(stderr,"Could not read the file: \"%s\"\n", fname);
            return 0;
        }
        if ( exclude )
        {
            for (i=0; i<nsmpl; i++)
                khash_str2int_inc(exclude, smpl[i]);
        }
        free_smpl = 1;
    }
    if ( !smpl )
    {
        smpl  = files->readers[0].header->samples;   // intersection of all samples
        nsmpl = bcf_hdr_nsamples(files->readers[0].header);
    }

    files->samples = NULL;
    files->n_smpl  = 0;
    for (i=0; i<nsmpl; i++)
    {
        if ( exclude && khash_str2int_has_key(exclude,smpl[i])  ) continue;

        int n_isec = 0;
        for (j=0; j<files->nreaders; j++)
        {
            if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
            n_isec++;
        }
        if ( n_isec!=files->nreaders )
        {
            fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
            continue;
        }

        files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
        files->samples[files->n_smpl++] = strdup(smpl[i]);
    }

    if ( exclude ) khash_str2int_destroy(exclude);
    if ( free_smpl )
    {
        for (i=0; i<nsmpl; i++) free(smpl[i]);
        free(smpl);
    }

    if ( !files->n_smpl )
    {
        if ( files->nreaders>1 )
            fprintf(stderr,"No samples in common.\n");
        return 0;
    }
    for (i=0; i<files->nreaders; i++)
    {
        bcf_sr_t *reader = &files->readers[i];
        reader->samples  = (int*) malloc(sizeof(int)*files->n_smpl);
        reader->n_smpl   = files->n_smpl;
        for (j=0; j<files->n_smpl; j++)
            reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
    }
    return 1;
}
Example #9
0
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    memset(&args,0,sizeof(args_t));

    int i;

    static struct option loptions[] =
    {
        {"help",            no_argument,       0,'h'},
        {"sample-list",     required_argument, 0,'s'},
        {0,0,0,0}
    };

    char **smps_strs = NULL;

    int c;
    while ((c = getopt_long(argc, argv, "?s:h",loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 's': smps_strs = hts_readlist(optarg,0,&(args.n_sel_smps));
                      if ( args.n_sel_smps == 0 )
                      {
                          fprintf(stderr, "Sample specification not valid.\n");
                          error("%s", usage());
                      }
                      break;
            case 'h': usage(); break;
            case '?':
            default: error("%s", usage()); break;
        }
    }
    if ( optind != argc )  usage();  // too many files given

    args.hdr = bcf_hdr_dup(in);

    // Samples parsing from header and input option
    if ( !bcf_hdr_nsamples(args.hdr) )
    {
        error("No samples in input file.\n");
    }
    args.nsmp = bcf_hdr_nsamples(args.hdr);
    args.selected_smps = (int*) calloc(args.nsmp,sizeof(int));
    for ( i = 0; i < args.n_sel_smps; i++ )
    {
        int ind = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, smps_strs[i]);
        if ( ind == -1 )
        {
            error("Sample '%s' not in input vcf file.\n", smps_strs[i]);
        } else {
            args.selected_smps[ind] = 1;
        }
        free(smps_strs[i]);
    }
    free(smps_strs);

    /*
    fprintf(stderr, "Selected samples array:[");
    for (i=0;i<args.nsmp;i++)
    {
        fprintf(stderr, " %i", args.selected_smps[i]);
    }
    fprintf(stderr, " ]\n");
    */

    if ( bcf_hdr_id2int(args.hdr, BCF_DT_ID, "GT")<0 ) error("[E::%s] GT not present in the header\n", __func__);

    args.gt_arr = NULL;

    return 0;
}
Example #10
0
static void init_data(args_t *args)
{
    int i;
    args->hdr = args->files->readers[0].header;

    if (args->calc_ac && args->update_info)
    {
        bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">");
        bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">");
    }
    bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view");

    // setup sample data
    if (args->sample_names)
    {
        void *hdr_samples = khash_str2int_init();
        for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
            khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));

        void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL;
        int nsmpl;
        char **smpl = NULL;
        args->samples = NULL; args->n_samples = 0;
        smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl);
        if ( !smpl )
        {
            error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names);
        }

        if ( exclude )
        {
            for (i=0; i<nsmpl; i++) {
                if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
                    if (args->force_samples) {
                        fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
                    } else {
                        error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
                    }
                }
                khash_str2int_inc(exclude, smpl[i]);
            }

            for (i=0; i<bcf_hdr_nsamples(args->hdr); i++)
            {
                if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i))  ) continue;
                args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
                args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i));
            }
            khash_str2int_destroy(exclude);
        }
        else
        {
            for (i=0; i<nsmpl; i++) {
                if (!khash_str2int_has_key(hdr_samples,smpl[i])) {
                    if (args->force_samples) {
                        fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]);
                        continue;
                    } else {
                        error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]);
                    }
                }
                args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*));
                args->samples[args->n_samples++] = strdup(smpl[i]);
            }
        }
        for (i=0; i<nsmpl; i++) free(smpl[i]);
        free(smpl);
        khash_str2int_destroy(hdr_samples);
        if (args->n_samples == 0) {
            fprintf(stderr, "Warn: subsetting has removed all samples\n");
            args->sites_only = 1;
        }
    }

    if (args->n_samples)
        args->imap = (int*)malloc(args->n_samples * sizeof(int));

    // determine variant types to include/exclude
    if (args->include_types || args->exclude_types) {
        if (args->include_types && args->exclude_types) {
            fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n");
            exit(1);
        }
        char **type_list = 0;
        int m = 0, n = 0;
        const char *q, *p;
        for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) {
            if (*p == ',' || *p == 0) {
                if (m == n) {
                    m = m? m<<1 : 16;
                    type_list = (char**)realloc(type_list, m * sizeof(char*));
                }
                type_list[n] = (char*)calloc(p - q + 1, 1);
                strncpy(type_list[n++], q, p - q);
                q = p + 1;
                if (*p == 0) break;
            }
        }
        type_list = (char**)realloc(type_list, n * sizeof(char*));

        if (args->include_types) {
            args->include = 0;
            for (i = 0; i < n; ++i) {
                if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP;
                else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL;
                else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP;
                else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER;
                else {
                    fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
                    fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
                    exit(1);
                }
            }
        }
        if (args->exclude_types) {
            args->exclude = 0;
            for (i = 0; i < n; ++i) {
                if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP;
                else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL;
                else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP;
                else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER;
                else {
                    fprintf(stderr, "[E::%s] unknown type\n", type_list[i]);
                    fprintf(stderr, "Accepted types are snps, indels, mnps, other\n");
                    exit(1);
                }
            }
        }
        for (i = 0; i < n; ++i)
            free(type_list[i]);
        free(type_list);
    }

    // setup output
    char modew[8];
    strcpy(modew, "w");
    if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel);
    if (args->output_type==FT_BCF) strcat(modew, "bu");         // uncompressed BCF
    else if (args->output_type & FT_BCF) strcat(modew, "b");    // compressed BCF
    else if (args->output_type & FT_GZ) strcat(modew,"z");      // compressed VCF
    args->out = hts_open(args->fn_out ? args->fn_out : "-", modew);
    if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));

    // headers: hdr=full header, hsub=subset header, hnull=sites only header
    if (args->sites_only){
        args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0);
        bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL);
    }
    if (args->n_samples > 0)
    {
        args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap);
        if ( !args->hsub ) error("Error occurred while subsetting samples\n");
        if ( args->n_samples != bcf_hdr_nsamples(args->hsub) )
        {
            int i;
            for (i=0; i<args->n_samples; i++)
                if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]);
        }
    }

    if ( args->filter_str )
        args->filter = filter_init(args->hdr, args->filter_str);
}
Example #11
0
static void init_data(args_t *args)
{
    args->prev_rid = args->skip_rid = -1;
    args->hdr = args->files->readers[0].header;

    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = strdup(args->hdr->samples[0]);
    }
    if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");

    // Set samples
    kstring_t str = {0,0,0};
    if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
    {
        int i, n;
        char **smpls = hts_readlist(args->estimate_AF, 1, &n);

        // Make sure the query sample is included
        for (i=0; i<n; i++)
            if ( !strcmp(args->sample,smpls[i]) ) break;

        // Add the query sample if not present
        if ( i!=n ) kputs(args->sample, &str);

        for (i=0; i<n; i++)
        {
            if ( str.l ) kputc(',', &str);
            kputs(smpls[i], &str);
            free(smpls[i]);
        }
        free(smpls);
    }
    else if ( !args->estimate_AF )
        kputs(args->sample, &str);

    if ( str.l )
    {
        int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
        if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
        else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
    }

    if ( args->af_tag )
        if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
            error("No such INFO tag in the VCF: %s\n", args->af_tag);

    args->nsmpl = bcf_hdr_nsamples(args->hdr);
    args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
    free(str.s);

    int i;
    for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);

    // Init transition matrix and HMM
    double tprob[4];
    MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
    MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
    MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
    MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW; 

    if ( args->genmap_fname ) 
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
    }
    else if ( args->rec_rate > 0 )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);

    }
    else
        args->hmm = hmm_init(2, tprob, 10000);

    // print header
    printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
    printf("# The command line was:\tbcftools %s", args->argv[0]);
    for (i=1; i<args->argc; i++)
        printf(" %s",args->argv[i]);
    printf("\n#\n");
    printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
}
Example #12
0
int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file)
{
    if ( list[0]!='^' ) bsmpl->rg_logic = 1;
    else list++;

    int i, nrows  = 0;
    char **rows = hts_readlist(list, is_file, &nrows);
    if ( !nrows ) return 0;

    kstring_t fld1 = {0,0,0};
    kstring_t fld2 = {0,0,0};
    kstring_t fld3 = {0,0,0};

    bsmpl->rg_list = khash_str2str_init();
    for (i=0; i<nrows; i++)
    {
        char *ptr = rows[i];
        fld1.l = fld2.l = fld3.l = 0;
        int escaped = 0;
        while ( *ptr )
        {
            if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
            if ( isspace(*ptr) && !escaped ) break;
            kputc(*ptr, &fld1);
            escaped = 0;
            ptr++;
        }
        if ( *ptr )
        {
            while ( *ptr && isspace(*ptr) ) ptr++;
            while ( *ptr )
            {
                if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
                if ( isspace(*ptr) && !escaped ) break;
                kputc(*ptr, &fld2);
                escaped = 0;
                ptr++;
            }
        }
        if ( *ptr )
        {
            while ( *ptr && isspace(*ptr) ) ptr++;
            while ( *ptr )
            {
                if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; }
                if ( isspace(*ptr) && !escaped ) break;
                kputc(*ptr, &fld3);
                escaped = 0;
                ptr++;
            }
        }
        if ( fld3.l )
        {
            // ID FILE SAMPLE
            kputc('\t',&fld1);
            kputs(fld2.s,&fld1);
            fld2.l = 0;
            kputs(fld3.s,&fld2);
        }
        // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name
        char *value = khash_str2str_get(bsmpl->rg_list,fld1.s);
        if ( !value )
            khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t"));
        else if ( strcmp(value,fld2.l?fld2.s:"\t") )
            error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t");
        free(rows[i]);
    }
    free(rows);
    free(fld1.s);
    free(fld2.s);
    free(fld3.s);
    return nrows;
}
Example #13
0
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    char *trio_samples = NULL, *unrelated_samples = NULL;
    memset(&args,0,sizeof(args_t));
    args.prev_rid = -1;
    args.hdr = in;
    args.pij = 2e-8;
    args.pgt_err = 1e-9;

    static struct option loptions[] =
    {
        {"prefix",1,0,'p'},
        {"trio",1,0,'t'},
        {"unrelated",1,0,'u'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:u:p:",loptions,NULL)) >= 0)
    {
        switch (c) 
        {
            case 'p': args.prefix = optarg; break;
            case 't': trio_samples = optarg; break;
            case 'u': unrelated_samples = optarg; break;
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    if ( optind != argc ) error(usage());
    if ( trio_samples && unrelated_samples ) error("Expected only one of the -t/-u options\n");
    if ( !trio_samples && !unrelated_samples ) error("Expected one of the -t/-u options\n");
    if ( !args.prefix ) error("Expected the -p option\n");

    int ret = bcf_hdr_set_samples(args.hdr, trio_samples ? trio_samples : unrelated_samples, 0);
    if ( ret<0 ) error("Could not parse samples: %s\n", trio_samples ? trio_samples : unrelated_samples);
    else if ( ret>0 ) error("%d-th sample not found: %s\n", ret,trio_samples ? trio_samples : unrelated_samples);

    if ( trio_samples )
    {
        int i,n = 0;
        char **list = hts_readlist(trio_samples, 0, &n);
        if ( n!=3 ) error("Expected three sample names with -t\n");
        args.imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        args.ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_trio;
        args.mode = C_TRIO;
        init_hmm_trio(&args);
    }
    else
    {
        int i,n = 0;
        char **list = hts_readlist(unrelated_samples, 0, &n);
        if ( n!=2 ) error("Expected two sample names with -u\n");
        args.isample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.jsample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_unrelated;
        args.mode = C_UNRL;
        init_hmm_unrelated(&args);
    }
    return 1;
}
Example #14
0
int main_vcfquery(int argc, char *argv[])
{
    int c, collapse = 0;
    args_t *args = (args_t*) calloc(1,sizeof(args_t));
    args->argc   = argc; args->argv = argv;
    int regions_is_file = 0, targets_is_file = 0;

    static struct option loptions[] =
    {
        {"help",0,0,'h'},
        {"list-samples",0,0,'l'},
        {"include",1,0,'i'},
        {"exclude",1,0,'e'},
        {"format",1,0,'f'},
        {"output-file",1,0,'o'},
        {"regions",1,0,'r'},
        {"regions-file",1,0,'R'},
        {"targets",1,0,'t'},
        {"targets-file",1,0,'T'},
        {"annots",1,0,'a'},
        {"samples",1,0,'s'},
        {"samples-file",1,0,'S'},
        {"print-header",0,0,'H'},
        {"collapse",1,0,'c'},
        {"vcf-list",1,0,'v'},
        {"allow-undef-tags",0,0,'u'},
        {0,0,0,0}
    };
    while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) {
        switch (c) {
            case 'o': args->fn_out = optarg; break;
            case 'f': args->format_str = strdup(optarg); break;
            case 'H': args->print_header = 1; break;
            case 'v': args->vcf_list = optarg; break;
            case 'c':
                if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS;
                else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS;
                else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS;
                else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY;
                else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY;
                else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME;
                else error("The --collapse string \"%s\" not recognised.\n", optarg);
                break;
            case 'a':
                {
                    kstring_t str = {0,0,0};
                    kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str);
                    char *p = optarg;
                    while ( *p )
                    {
                        if ( *p==',' )
                            kputs("\t%", &str);
                        else
                            kputc(*p, &str);
                        p++;
                    }
                    kputc('\n', &str);
                    args->format_str = str.s;
                    break;
                }
            case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break;
            case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break;
            case 'r': args->regions_list = optarg; break;
            case 'R': args->regions_list = optarg; regions_is_file = 1; break;
            case 't': args->targets_list = optarg; break;
            case 'T': args->targets_list = optarg; targets_is_file = 1; break;
            case 'l': args->list_columns = 1; break;
            case 'u': args->allow_undef_tags = 1; break;
            case 's': args->sample_list = optarg; break;
            case 'S': args->sample_list = optarg; args->sample_is_file = 1; break;
            case 'h':
            case '?': usage();
            default: error("Unknown argument: %s\n", optarg);
        }
    }

    char *fname = NULL;
    if ( optind>=argc )
    {
        if ( !isatty(fileno((FILE *)stdin)) ) fname = "-";
    }
    else fname = argv[optind];

    if ( args->list_columns )
    {
        if ( !fname ) error("Missing the VCF file name\n");
        args->files = bcf_sr_init();
        if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
        list_columns(args);
        bcf_sr_destroy(args->files);
        free(args);
        return 0;
    }

    if ( !args->format_str ) usage();
    args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout;
    if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno));

    if ( !args->vcf_list )
    {
        if ( !fname ) usage();
        args->files = bcf_sr_init();
        args->files->collapse = collapse;
        if ( optind+1 < argc ) args->files->require_index = 1;
        if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
        if ( args->targets_list )
        {
            if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 )
                error("Failed to read the targets: %s\n", args->targets_list);
        }
        while ( fname )
        {
            if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum));
            fname = ++optind < argc ? argv[optind] : NULL;
        }
        init_data(args);
        query_vcf(args);
        free(args->format_str);
        destroy_data(args);
        bcf_sr_destroy(args->files);
        fclose(args->out);
        free(args);
        return 0;
    }

    // multiple VCFs
    int i, k, nfiles, prev_nsamples = 0;
    char **fnames, **prev_samples = NULL;
    fnames = hts_readlist(args->vcf_list, 1, &nfiles);
    if ( !nfiles ) error("No files in %s?\n", args->vcf_list);
    for (i=0; i<nfiles; i++)
    {
        args->files = bcf_sr_init();
        args->files->collapse = collapse;
        if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions_list);
        if ( optind < argc ) args->files->require_index = 1;
        if ( args->targets_list )
        {
            if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 )
                error("Failed to read the targets: %s\n", args->targets_list);
        }
        if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum));
        for (k=optind; k<argc; k++)
            if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum));
        init_data(args);
        if ( i==0 )
            prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header));
        else
        {
            args->print_header = 0;
            if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) )
                error("Different samples in %s and %s\n", fnames[i-1],fnames[i]);
        }
        query_vcf(args);
        destroy_data(args);
        bcf_sr_destroy(args->files);
    }
    fclose(args->out);
    destroy_list(fnames, nfiles);
    destroy_list(prev_samples, prev_nsamples);
    free(args->format_str);
    free(args);
    return 0;
}
Example #15
0
/*
 *  Reads sample names and their ploidy (optional) from a file.
 *  Alternatively, if no such file exists, the file name is interpreted
 *  as a comma-separated list of samples. When ploidy is not present,
 *  the default ploidy 2 is assumed.
 */
static void set_samples(args_t *args, const char *fn, int is_file)
{
    int i, nlines;
    char **lines = hts_readlist(fn, is_file, &nlines);
    if ( !lines ) error("Could not read the file: %s\n", fn);

    int nsmpls;
    char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
    if ( smpls )
    {
        for (i=0; i<nlines; i++) free(lines[i]);
        free(lines);
        lines = smpls;
        nlines = nsmpls;
    }

    args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
    args->sample2sex  = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
    int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
    for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;

    int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
    for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) old2new[i] = -1;

    int nsmpl = 0, map_needed = 0;
    for (i=0; i<nlines; i++)
    {
        char *ss = lines[i];
        while ( *ss && isspace(*ss) ) ss++;
        if ( !*ss ) error("Could not parse: %s\n", lines[i]);
        if ( *ss=='#' ) continue;
        char *se = ss;
        while ( *se && !isspace(*se) ) se++;
        char x = *se, *xptr = se; *se = 0;

        int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
        if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }

        ss = se+1;
        while ( *ss && isspace(*ss) ) ss++;
        if ( !*ss ) ss = "2";   // default ploidy
        se = ss;
        while ( *se && !isspace(*se) ) se++;
        if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }

        if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
            args->sample2sex[nsmpl] = -1*(ss[0]-'0');
        else
            args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);

        if ( ismpl!=nsmpl ) map_needed = 1;
        args->samples_map[nsmpl] = ismpl;
        old2new[ismpl] = nsmpl;
        nsmpl++;
    }

    for (i=0; i<args->aux.nfams; i++)
    {
        int j, nmiss = 0;
        family_t *fam = &args->aux.fams[i];
        for (j=0; j<3; j++)
        {
            fam->sample[i] = old2new[fam->sample[i]];
            if ( fam->sample[i]<0 ) nmiss++;
        }
        assert( nmiss==0 || nmiss==3 );
    }
    free(old2new);

    if ( !map_needed ) { free(args->samples_map); args->samples_map = NULL; }

    args->nsamples = nsmpl;
    args->samples = lines;
}