int parse_format_flag(const char *str) { int i, flag = 0, n_tags; char **tags = hts_readlist(str, 0, &n_tags); for(i=0; i<n_tags; i++) { if ( !strcasecmp(tags[i],"DP") || !strcasecmp(tags[i],"FORMAT/DP") || !strcasecmp(tags[i],"FMT/DP") ) flag |= B2B_FMT_DP; else if ( !strcasecmp(tags[i],"DV") || !strcasecmp(tags[i],"FORMAT/DV") || !strcasecmp(tags[i],"FMT/DV") ) { flag |= B2B_FMT_DV; fprintf(stderr, "[warning] tag DV functional, but deprecated. Please switch to `AD` in future.\n"); } else if ( !strcasecmp(tags[i],"SP") || !strcasecmp(tags[i],"FORMAT/SP") || !strcasecmp(tags[i],"FMT/SP") ) flag |= B2B_FMT_SP; else if ( !strcasecmp(tags[i],"DP4") || !strcasecmp(tags[i],"FORMAT/DP4") || !strcasecmp(tags[i],"FMT/DP4") ) { flag |= B2B_FMT_DP4; fprintf(stderr, "[warning] tag DP4 functional, but deprecated. Please switch to `ADF` and `ADR` in future.\n"); } else if ( !strcasecmp(tags[i],"DPR") || !strcasecmp(tags[i],"FORMAT/DPR") || !strcasecmp(tags[i],"FMT/DPR") ) { flag |= B2B_FMT_DPR; fprintf(stderr, "[warning] tag DPR functional, but deprecated. Please switch to `AD` in future.\n"); } else if ( !strcasecmp(tags[i],"INFO/DPR") ) { flag |= B2B_INFO_DPR; fprintf(stderr, "[warning] tag INFO/DPR functional, but deprecated. 
Please switch to `INFO/AD` in future.\n"); } else if ( !strcasecmp(tags[i],"AD") || !strcasecmp(tags[i],"FORMAT/AD") || !strcasecmp(tags[i],"FMT/AD") ) flag |= B2B_FMT_AD; else if ( !strcasecmp(tags[i],"ADF") || !strcasecmp(tags[i],"FORMAT/ADF") || !strcasecmp(tags[i],"FMT/ADF") ) flag |= B2B_FMT_ADF; else if ( !strcasecmp(tags[i],"ADR") || !strcasecmp(tags[i],"FORMAT/ADR") || !strcasecmp(tags[i],"FMT/ADR") ) flag |= B2B_FMT_ADR; else if ( !strcasecmp(tags[i],"INFO/AD") ) flag |= B2B_INFO_AD; else if ( !strcasecmp(tags[i],"INFO/ADF") ) flag |= B2B_INFO_ADF; else if ( !strcasecmp(tags[i],"INFO/ADR") ) flag |= B2B_INFO_ADR; else { fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str); exit(EXIT_FAILURE); } free(tags[i]); } if (n_tags) free(tags); return flag; }
int bam_smpl_add_samples(bam_smpl_t *bsmpl, char *list, int is_file) { if ( list[0]!='^' ) bsmpl->sample_logic = 1; else list++; int i, nsamples = 0; char **samples = hts_readlist(list, is_file, &nsamples); if ( !nsamples ) return 0; kstring_t ori = {0,0,0}; kstring_t ren = {0,0,0}; bsmpl->sample_list = khash_str2str_init(); for (i=0; i<nsamples; i++) { char *ptr = samples[i]; ori.l = ren.l = 0; int escaped = 0; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &ori); escaped = 0; ptr++; } if ( *ptr ) { while ( *ptr && isspace(*ptr) ) ptr++; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &ren); escaped = 0; ptr++; } } khash_str2str_set(bsmpl->sample_list,strdup(ori.s),strdup(ren.l?ren.s:ori.s)); free(samples[i]); } free(samples); free(ori.s); free(ren.s); return nsamples; }
/*
 * Prepares the output converter and the optional filter for the readers
 * currently attached to args->files.
 *
 * When a sample list other than "-" was given, every reader's header is
 * subset to it. For a non-negated list the header indices of the listed
 * samples are passed to convert_init() in list order; otherwise
 * convert_init() receives no explicit sample array.
 */
static void init_data(args_t *args)
{
    int *smpl_idx = NULL;
    int n_idx = 0;
    int k;

    args->header = args->files->readers[0].header;

    const int have_list = args->sample_list!=NULL && strcmp("-",args->sample_list)!=0;

    if ( have_list )
    {
        for (k=0; k<args->files->nreaders; k++)
        {
            int status = bcf_hdr_set_samples(args->files->readers[k].header,args->sample_list,args->sample_is_file);
            if ( status<0 ) error("Error parsing the sample list\n");
            else if ( status>0 ) error("Sample name mismatch: sample #%d not found in the header\n", status);
        }
    }

    if ( have_list && args->sample_list[0]!='^' )
    {
        // the sample ordering may be different if not negated
        int n_names;
        char **names = hts_readlist(args->sample_list, args->sample_is_file, &n_names);
        if ( !names ) error("Could not parse %s\n", args->sample_list);
        if ( n_names!=bcf_hdr_nsamples(args->header) )
            error("The number of samples does not match, perhaps some are present multiple times?\n");

        n_idx    = bcf_hdr_nsamples(args->header);
        smpl_idx = (int*) malloc(sizeof(int)*n_idx);
        k = 0;
        while ( k<n_names )
        {
            smpl_idx[k] = bcf_hdr_id2int(args->header, BCF_DT_SAMPLE, names[k]);
            free(names[k]);
            k++;
        }
        free(names);
    }

    args->convert = convert_init(args->header, smpl_idx, n_idx, args->format_str);
    if ( args->allow_undef_tags )
        convert_set_option(args->convert, allow_undef_tags, 1);
    free(smpl_idx);

    // Unpack only as much of each record as the converter and the filter need
    int unpack = convert_max_unpack(args->convert);
    if ( args->filter_str )
    {
        args->filter = filter_init(args->header, args->filter_str);
        unpack |= filter_max_unpack(args->filter);
    }
    args->files->max_unpack = unpack;
}
/*
 * Converts a comma-separated list of annotation tags (DP, DV, SP, DP4,
 * DPR, INFO/DPR; case-insensitive) into a bitmask of B2B_FMT_* and
 * B2B_INFO_* flags.
 *
 * Returns the OR of all recognised flags. An unknown tag, or a list that
 * cannot be read, prints a message to stderr and terminates the program
 * with status 1.
 */
int parse_format_flag(const char *str)
{
    int i, flag = 0, n_tags = 0;
    char **tags = hts_readlist(str, 0, &n_tags);
    if ( !tags )
    {
        // n_tags is not reliable when no list is returned; do not iterate
        fprintf(stderr,"Could not parse \"%s\"\n", str);
        exit(1);
    }
    for (i=0; i<n_tags; i++)
    {
        if ( !strcasecmp(tags[i],"DP") ) flag |= B2B_FMT_DP;
        else if ( !strcasecmp(tags[i],"DV") ) flag |= B2B_FMT_DV;
        else if ( !strcasecmp(tags[i],"SP") ) flag |= B2B_FMT_SP;
        else if ( !strcasecmp(tags[i],"DP4") ) flag |= B2B_FMT_DP4;
        else if ( !strcasecmp(tags[i],"DPR") ) flag |= B2B_FMT_DPR;
        else if ( !strcasecmp(tags[i],"INFO/DPR") ) flag |= B2B_INFO_DPR;
        else
        {
            fprintf(stderr,"Could not parse tag \"%s\" in \"%s\"\n", tags[i], str);
            exit(1);
        }
        free(tags[i]);
    }
    free(tags);
    return flag;
}
/*
 * Converts the comma-separated value of --tags into a bitmask of SET_*
 * flags. Recognised tags (case-insensitive): AN, AC, NS, AC_Hom, AC_Het,
 * AC_Hemi, AF.
 *
 *  args: not used by this function.
 *  str:  the tag list.
 *
 * Returns the OR of all recognised flags. An unsupported tag, or a list
 * that cannot be read, prints a message to stderr and terminates the
 * program with status 1.
 */
int parse_tags(args_t *args, const char *str)
{
    int i, flag = 0, n_tags = 0;
    char **tags = hts_readlist(str, 0, &n_tags);
    if ( !tags )
    {
        // n_tags is not reliable when no list is returned; do not iterate
        fprintf(stderr,"Error parsing \"--tags %s\"\n", str);
        exit(1);
    }
    for (i=0; i<n_tags; i++)
    {
        if ( !strcasecmp(tags[i],"AN") ) flag |= SET_AN;
        else if ( !strcasecmp(tags[i],"AC") ) flag |= SET_AC;
        else if ( !strcasecmp(tags[i],"NS") ) flag |= SET_NS;
        else if ( !strcasecmp(tags[i],"AC_Hom") ) flag |= SET_AC_Hom;
        else if ( !strcasecmp(tags[i],"AC_Het") ) flag |= SET_AC_Het;
        else if ( !strcasecmp(tags[i],"AC_Hemi") ) flag |= SET_AC_Hemi;
        else if ( !strcasecmp(tags[i],"AF") ) flag |= SET_AF;
        else
        {
            fprintf(stderr,"Error parsing \"--tags %s\": the tag \"%s\" is not supported\n", str,tags[i]);
            exit(1);
        }
        free(tags[i]);
    }
    free(tags);
    return flag;
}
/*
 * Prints the sample names of the first reader's header to stdout, one per
 * line and in header order. When args->sample_list is set, only the
 * samples named in that list are printed.
 *
 * Terminates the program with status 1 when the sample list cannot be read.
 */
static void list_columns(args_t *args)
{
    void *has_sample = NULL;
    if ( args->sample_list )
    {
        int i, nsmpl = 0;
        char **smpl = hts_readlist(args->sample_list, args->sample_is_file, &nsmpl);
        if ( !smpl )
        {
            // nsmpl is not reliable when no list is returned; do not iterate
            fprintf(stderr,"Could not parse %s\n", args->sample_list);
            exit(1);
        }
        has_sample = khash_str2int_init();
        for (i=0; i<nsmpl; i++)
        {
            // The hash takes ownership of the first occurrence of a name;
            // repeated names are released here.
            if ( khash_str2int_has_key(has_sample, smpl[i]) ) free(smpl[i]);
            else khash_str2int_inc(has_sample, smpl[i]);
        }
        free(smpl);
    }

    int i;
    bcf_sr_t *reader = &args->files->readers[0];
    for (i=0; i<bcf_hdr_nsamples(reader->header); i++)
    {
        if ( has_sample && !khash_str2int_has_key(has_sample, reader->header->samples[i]) ) continue;
        printf("%s\n", reader->header->samples[i]);
    }

    if ( has_sample )
        khash_str2int_destroy_free(has_sample);
}
int run(int argc, char **argv) { char *trio_samples = NULL, *trio_file = NULL, *rules_fname = NULL, *rules_string = NULL; memset(&args,0,sizeof(args_t)); args.mode = 0; args.output_fname = "-"; static struct option loptions[] = { {"trio",1,0,'t'}, {"trio-file",1,0,'T'}, {"delete",0,0,'d'}, {"list",1,0,'l'}, {"count",0,0,'c'}, {"rules",1,0,'r'}, {"rules-file",1,0,'R'}, {"output",required_argument,NULL,'o'}, {"output-type",required_argument,NULL,'O'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cdr:R:o:O:",loptions,NULL)) >= 0) { switch (c) { case 'o': args.output_fname = optarg; break; case 'O': switch (optarg[0]) { case 'b': args.output_type = FT_BCF_GZ; break; case 'u': args.output_type = FT_BCF; break; case 'z': args.output_type = FT_VCF_GZ; break; case 'v': args.output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); }; break; case 'R': rules_fname = optarg; break; case 'r': rules_string = optarg; break; case 'd': args.mode |= MODE_DELETE; break; case 'c': args.mode |= MODE_COUNT; break; case 'l': if ( !strcmp("+",optarg) ) args.mode |= MODE_LIST_GOOD; else if ( !strcmp("x",optarg) ) args.mode |= MODE_LIST_BAD; else error("The argument not recognised: --list %s\n", optarg); break; case 't': trio_samples = optarg; break; case 'T': trio_file = optarg; break; case 'h': case '?': default: error("%s",usage()); break; } } if ( rules_fname ) args.rules = regidx_init(rules_fname, parse_rules, NULL, sizeof(rule_t), &args); else args.rules = init_rules(&args, rules_string); if ( !args.rules ) return -1; args.itr = regitr_init(args.rules); args.itr_ori = regitr_init(args.rules); char *fname = NULL; if ( optind>=argc || argv[optind][0]=='-' ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else error("%s",usage()); } else fname = argv[optind]; if ( !trio_samples && !trio_file ) error("Expected the -t/T option\n"); if ( !args.mode ) error("Expected one of the -c, -d or -l options\n"); if 
( args.mode&MODE_DELETE && !(args.mode&(MODE_LIST_GOOD|MODE_LIST_BAD)) ) args.mode |= MODE_LIST_GOOD|MODE_LIST_BAD; args.sr = bcf_sr_init(); if ( !bcf_sr_add_reader(args.sr, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args.sr->errnum)); args.hdr = bcf_sr_get_header(args.sr, 0); args.out_fh = hts_open(args.output_fname,hts_bcf_wmode(args.output_type)); if ( args.out_fh == NULL ) error("Can't write to \"%s\": %s\n", args.output_fname, strerror(errno)); bcf_hdr_write(args.out_fh, args.hdr); int i, n = 0; char **list; if ( trio_samples ) { args.ntrios = 1; args.trios = (trio_t*) calloc(1,sizeof(trio_t)); list = hts_readlist(trio_samples, 0, &n); if ( n!=3 ) error("Expected three sample names with -t\n"); args.trios[0].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]); args.trios[0].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]); args.trios[0].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]); for (i=0; i<n; i++) free(list[i]); free(list); } if ( trio_file ) { list = hts_readlist(trio_file, 1, &n); args.ntrios = n; args.trios = (trio_t*) calloc(n,sizeof(trio_t)); for (i=0; i<n; i++) { char *ss = list[i], *se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s: %s\n",trio_file, ss); *se = 0; args.trios[i].imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].imother<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; se = strchr(ss, ','); if ( !se ) error("Could not parse %s\n",trio_file); *se = 0; args.trios[i].ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ifather<0 ) error("No such sample: \"%s\"\n", ss); ss = ++se; if ( *ss=='\0' ) error("Could not parse %s\n",trio_file); args.trios[i].ichild = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, ss); if ( args.trios[i].ichild<0 ) error("No such sample: \"%s\"\n", ss); free(list[i]); } free(list); } while ( bcf_sr_next_line(args.sr) ) { bcf1_t *line = bcf_sr_get_line(args.sr,0); line = process(line); if ( line ) { if ( 
line->errcode ) error("TODO: Unchecked error (%d), exiting\n",line->errcode); bcf_write1(args.out_fh, args.hdr, line); } } fprintf(stderr,"# [1]nOK\t[2]nBad\t[3]nSkipped\t[4]Trio\n"); for (i=0; i<args.ntrios; i++) { trio_t *trio = &args.trios[i]; fprintf(stderr,"%d\t%d\t%d\t%s,%s,%s\n", trio->nok,trio->nbad,args.nrec-(trio->nok+trio->nbad), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->imother), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ifather), bcf_hdr_int2id(args.hdr, BCF_DT_SAMPLE, trio->ichild) ); } free(args.gt_arr); free(args.trios); regitr_destroy(args.itr); regitr_destroy(args.itr_ori); regidx_destroy(args.rules); bcf_sr_destroy(args.sr); if ( hts_close(args.out_fh)!=0 ) error("Error: close failed\n"); return 0; }
/*
 * Selects the samples to be used by all readers.
 *
 *  files:   the synced reader; files->samples / files->n_smpl and each
 *           reader's samples / n_smpl are (re)assigned.
 *  fname:   "-" for all samples of the first reader, a comma-separated
 *           list, or a file name. A leading '^' negates the list: every
 *           sample of the first reader EXCEPT the listed ones is used.
 *  is_file: passed through to hts_readlist().
 *
 * Only samples present in every reader are kept; a warning is printed for
 * each candidate that is missing from some reader.
 *
 * Returns 1 on success, 0 when the list cannot be read or no sample is left.
 */
int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
{
    int i, j, nsmpl = 0, nlist = 0;
    char **list = NULL;     // names read from fname, owned by this function
    char **smpl = NULL;     // candidates: either `list` or the first header's samples
    void *exclude = NULL;
    const int negate = fname[0]=='^';
    const char *list_name = negate ? fname+1 : fname;   // the '^' is not part of the list

    if ( negate || strcmp("-",fname) ) // "-" stands for all samples
    {
        list = hts_readlist(list_name, is_file, &nlist);
        if ( !list )
        {
            fprintf(stderr,"Could not read the file: \"%s\"\n", list_name);
            return 0;
        }
    }

    if ( negate )
    {
        // The keys point into `list`, which outlives the hash
        exclude = khash_str2int_init();
        for (i=0; i<nlist; i++)
            khash_str2int_inc(exclude, list[i]);
    }

    if ( list && !negate )
    {
        smpl  = list;
        nsmpl = nlist;
    }
    else
    {
        // All samples ("-"), or the set the exclusions are applied to
        smpl  = files->readers[0].header->samples;
        nsmpl = bcf_hdr_nsamples(files->readers[0].header);
    }

    files->samples = NULL;
    files->n_smpl  = 0;
    for (i=0; i<nsmpl; i++)
    {
        if ( exclude && khash_str2int_has_key(exclude,smpl[i]) ) continue;

        // n_isec ends up as the index of the first reader lacking the sample
        int n_isec = 0;
        for (j=0; j<files->nreaders; j++)
        {
            if ( bcf_hdr_id2int(files->readers[j].header, BCF_DT_SAMPLE, smpl[i])<0 ) break;
            n_isec++;
        }
        if ( n_isec!=files->nreaders )
        {
            fprintf(stderr,"Warning: The sample \"%s\" was not found in %s, skipping\n", smpl[i], files->readers[n_isec].fname);
            continue;
        }
        files->samples = (char**) realloc(files->samples, (files->n_smpl+1)*sizeof(const char*));
        files->samples[files->n_smpl++] = strdup(smpl[i]);
    }

    if ( exclude ) khash_str2int_destroy(exclude);
    if ( list )
    {
        for (i=0; i<nlist; i++) free(list[i]);
        free(list);
    }

    if ( !files->n_smpl )
    {
        if ( files->nreaders>1 )
            fprintf(stderr,"No samples in common.\n");
        return 0;
    }
    for (i=0; i<files->nreaders; i++)
    {
        bcf_sr_t *reader = &files->readers[i];
        reader->samples  = (int*) malloc(sizeof(int)*files->n_smpl);
        reader->n_smpl   = files->n_smpl;
        for (j=0; j<files->n_smpl; j++)
            reader->samples[j] = bcf_hdr_id2int(reader->header, BCF_DT_SAMPLE, files->samples[j]);
    }
    return 1;
}
/*
 * Plugin initialisation: parses the options, duplicates the input header
 * into args.hdr and marks the samples named with -s/--sample-list in
 * args.selected_smps (one int per header sample, 1 = selected).
 *
 *  in:  input header, duplicated.
 *  out: not used by this function.
 *
 * Returns 0. Invalid options, extra arguments, a header without samples,
 * an unknown sample name or a missing GT definition are reported through
 * error().
 */
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    memset(&args,0,sizeof(args_t));
    int i;

    static struct option loptions[] =
    {
        {"help", no_argument, 0,'h'},
        {"sample-list", required_argument, 0,'s'},
        {0,0,0,0}
    };
    char **smps_strs = NULL;
    int c;
    while ((c = getopt_long(argc, argv, "?s:h",loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 's':
                smps_strs = hts_readlist(optarg,0,&(args.n_sel_smps));
                if ( !smps_strs || args.n_sel_smps == 0 )
                {
                    fprintf(stderr, "Sample specification not valid.\n");
                    error("%s", usage());
                }
                break;
            // usage() only returns the text, it has to be printed explicitly
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    if ( optind != argc ) error("%s", usage());  // too many files given

    args.hdr = bcf_hdr_dup(in);

    // Samples parsing from header and input option
    if ( !bcf_hdr_nsamples(args.hdr) )
    {
        error("No samples in input file.\n");
    }
    args.nsmp = bcf_hdr_nsamples(args.hdr);
    args.selected_smps = (int*) calloc(args.nsmp,sizeof(int));
    for ( i = 0; i < args.n_sel_smps; i++ )
    {
        int ind = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, smps_strs[i]);
        if ( ind < 0 )
        {
            error("Sample '%s' not in input vcf file.\n", smps_strs[i]);
        }
        else
        {
            args.selected_smps[ind] = 1;
        }
        free(smps_strs[i]);
    }
    free(smps_strs);

    if ( bcf_hdr_id2int(args.hdr, BCF_DT_ID, "GT")<0 )
        error("[E::%s] GT not present in the header\n", __func__);

    args.gt_arr = NULL;

    return 0;
}
static void init_data(args_t *args) { int i; args->hdr = args->files->readers[0].header; if (args->calc_ac && args->update_info) { bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">"); bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"); } bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); // setup sample data if (args->sample_names) { void *hdr_samples = khash_str2int_init(); for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL; int nsmpl; char **smpl = NULL; args->samples = NULL; args->n_samples = 0; smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl); if ( !smpl ) { error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names); } if ( exclude ) { for (i=0; i<nsmpl; i++) { if (!khash_str2int_has_key(hdr_samples,smpl[i])) { if (args->force_samples) { fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); } else { error("Error: exclude called for sample that does not exist in header: \"%s\". 
Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } khash_str2int_inc(exclude, smpl[i]); } for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) { if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue; args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); } khash_str2int_destroy(exclude); } else { for (i=0; i<nsmpl; i++) { if (!khash_str2int_has_key(hdr_samples,smpl[i])) { if (args->force_samples) { fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); continue; } else { error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(smpl[i]); } } for (i=0; i<nsmpl; i++) free(smpl[i]); free(smpl); khash_str2int_destroy(hdr_samples); if (args->n_samples == 0) { fprintf(stderr, "Warn: subsetting has removed all samples\n"); args->sites_only = 1; } } if (args->n_samples) args->imap = (int*)malloc(args->n_samples * sizeof(int)); // determine variant types to include/exclude if (args->include_types || args->exclude_types) { if (args->include_types && args->exclude_types) { fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n"); exit(1); } char **type_list = 0; int m = 0, n = 0; const char *q, *p; for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) { if (*p == ',' || *p == 0) { if (m == n) { m = m? 
m<<1 : 16; type_list = (char**)realloc(type_list, m * sizeof(char*)); } type_list[n] = (char*)calloc(p - q + 1, 1); strncpy(type_list[n++], q, p - q); q = p + 1; if (*p == 0) break; } } type_list = (char**)realloc(type_list, n * sizeof(char*)); if (args->include_types) { args->include = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP; else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL; else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } if (args->exclude_types) { args->exclude = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP; else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL; else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } for (i = 0; i < n; ++i) free(type_list[i]); free(type_list); } // setup output char modew[8]; strcpy(modew, "w"); if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? 
args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); // headers: hdr=full header, hsub=subset header, hnull=sites only header if (args->sites_only){ args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0); bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL); } if (args->n_samples > 0) { args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap); if ( !args->hsub ) error("Error occurred while subsetting samples\n"); if ( args->n_samples != bcf_hdr_nsamples(args->hsub) ) { int i; for (i=0; i<args->n_samples; i++) if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]); } } if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); }
/*
 * Prepares the run: selects the query sample, subsets the header to the
 * samples used for allele-frequency estimation, fills the PL->probability
 * table, initialises the two-state (HW/AZ) HMM and prints the output
 * header to stdout.
 *
 * Sample selection:
 *  - without -s the VCF must contain exactly one sample, which is used;
 *  - with a sample file in estimate_AF (anything but "-") the header is
 *    subset to those samples plus the query sample;
 *  - with estimate_AF unset the header is subset to the query sample only;
 *  - with estimate_AF "-" the header is left as is.
 * Fatal conditions are reported through error().
 */
static void init_data(args_t *args)
{
    args->prev_rid = args->skip_rid = -1;
    args->hdr = args->files->readers[0].header;

    // Must come first: samples[0] is read below
    if ( !bcf_hdr_nsamples(args->hdr) ) error("No samples in the VCF?\n");

    if ( !args->sample )
    {
        if ( bcf_hdr_nsamples(args->hdr)>1 ) error("Missing the option -s, --sample\n");
        args->sample = strdup(args->hdr->samples[0]);
    }

    // Set samples
    kstring_t str = {0,0,0};
    if ( args->estimate_AF && strcmp("-",args->estimate_AF) )
    {
        int i, n = 0;
        char **smpls = hts_readlist(args->estimate_AF, 1, &n);
        if ( !smpls ) error("Could not read the file: %s\n", args->estimate_AF);

        // Make sure the query sample is included
        for (i=0; i<n; i++)
            if ( !strcmp(args->sample,smpls[i]) ) break;

        // Add the query sample if not present (i==n: the search above found nothing)
        if ( i==n ) kputs(args->sample, &str);

        for (i=0; i<n; i++)
        {
            if ( str.l ) kputc(',', &str);
            kputs(smpls[i], &str);
            free(smpls[i]);
        }
        free(smpls);
    }
    else if ( !args->estimate_AF )
        kputs(args->sample, &str);

    if ( str.l )
    {
        int ret = bcf_hdr_set_samples(args->hdr, str.s, 0);
        if ( ret<0 ) error("Error parsing the list of samples: %s\n", str.s);
        else if ( ret>0 ) error("The %d-th sample not found in the VCF\n", ret);
    }

    if ( args->af_tag )
        if ( !bcf_hdr_idinfo_exists(args->hdr,BCF_HL_INFO,bcf_hdr_id2int(args->hdr,BCF_DT_ID,args->af_tag)) )
            error("No such INFO tag in the VCF: %s\n", args->af_tag);

    args->nsmpl = bcf_hdr_nsamples(args->hdr);
    args->ismpl = bcf_hdr_id2int(args->hdr, BCF_DT_SAMPLE, args->sample);
    // Not caught above when the header was not subset (estimate_AF is "-")
    if ( args->ismpl<0 ) error("No such sample in the VCF: %s\n", args->sample);
    free(str.s);

    // Phred-scaled likelihood -> probability lookup
    int i;
    for (i=0; i<256; i++) args->pl2p[i] = pow(10., -i/10.);

    // Init transition matrix and HMM
    double tprob[4];
    MAT(tprob,2,STATE_HW,STATE_HW) = 1 - args->t2AZ;
    MAT(tprob,2,STATE_HW,STATE_AZ) = args->t2HW;
    MAT(tprob,2,STATE_AZ,STATE_HW) = args->t2AZ;
    MAT(tprob,2,STATE_AZ,STATE_AZ) = 1 - args->t2HW;

    if ( args->genmap_fname )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_genmap, args);
    }
    else if ( args->rec_rate > 0 )
    {
        args->hmm = hmm_init(2, tprob, 0);
        hmm_set_tprob_func(args->hmm, set_tprob_recrate, args);
    }
    else
        args->hmm = hmm_init(2, tprob, 10000);

    // print header
    printf("# This file was produced by: bcftools roh(%s+htslib-%s)\n", bcftools_version(),hts_version());
    printf("# The command line was:\tbcftools %s", args->argv[0]);
    for (i=1; i<args->argc; i++)
        printf(" %s",args->argv[i]);
    printf("\n#\n");
    printf("# [1]Chromosome\t[2]Position\t[3]State (0:HW, 1:AZ)\t[4]Quality\n");
}
int bam_smpl_add_readgroups(bam_smpl_t *bsmpl, char *list, int is_file) { if ( list[0]!='^' ) bsmpl->rg_logic = 1; else list++; int i, nrows = 0; char **rows = hts_readlist(list, is_file, &nrows); if ( !nrows ) return 0; kstring_t fld1 = {0,0,0}; kstring_t fld2 = {0,0,0}; kstring_t fld3 = {0,0,0}; bsmpl->rg_list = khash_str2str_init(); for (i=0; i<nrows; i++) { char *ptr = rows[i]; fld1.l = fld2.l = fld3.l = 0; int escaped = 0; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &fld1); escaped = 0; ptr++; } if ( *ptr ) { while ( *ptr && isspace(*ptr) ) ptr++; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &fld2); escaped = 0; ptr++; } } if ( *ptr ) { while ( *ptr && isspace(*ptr) ) ptr++; while ( *ptr ) { if ( *ptr=='\\' && !escaped ) { escaped = 1; ptr++; continue; } if ( isspace(*ptr) && !escaped ) break; kputc(*ptr, &fld3); escaped = 0; ptr++; } } if ( fld3.l ) { // ID FILE SAMPLE kputc('\t',&fld1); kputs(fld2.s,&fld1); fld2.l = 0; kputs(fld3.s,&fld2); } // fld2.s now contains a new sample name. If NULL, use \t to keep the bam header name char *value = khash_str2str_get(bsmpl->rg_list,fld1.s); if ( !value ) khash_str2str_set(bsmpl->rg_list,strdup(fld1.s),strdup(fld2.l?fld2.s:"\t")); else if ( strcmp(value,fld2.l?fld2.s:"\t") ) error("Error: The read group \"%s\" was assigned to two different samples: \"%s\" and \"%s\"\n", fld1.s,value,fld2.l?fld2.s:"\t"); free(rows[i]); } free(rows); free(fld1.s); free(fld2.s); free(fld3.s); return nrows; }
/*
 * Plugin initialisation: parses the options, subsets the header to the
 * samples given with -t (mother,father,child) or -u (two unrelated
 * samples), stores their header indices and initialises the matching HMM.
 *
 *  in:  input header; it is used directly (and subset in place) as args.hdr.
 *  out: not used by this function.
 *
 * Returns 1. Invalid or missing options and unknown samples are reported
 * through error().
 */
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out)
{
    char *trio_samples = NULL, *unrelated_samples = NULL;
    memset(&args,0,sizeof(args_t));
    args.prev_rid = -1;
    args.hdr = in;
    args.pij = 2e-8;
    args.pgt_err = 1e-9;

    static struct option loptions[] =
    {
        {"prefix",1,0,'p'},
        {"trio",1,0,'t'},
        {"unrelated",1,0,'u'},
        {0,0,0,0}
    };
    int c;
    while ((c = getopt_long(argc, argv, "?ht:u:p:",loptions,NULL)) >= 0)
    {
        switch (c)
        {
            case 'p': args.prefix = optarg; break;
            case 't': trio_samples = optarg; break;
            case 'u': unrelated_samples = optarg; break;
            case 'h':
            case '?':
            default: error("%s", usage()); break;
        }
    }
    // The usage text is data, not a format string
    if ( optind != argc ) error("%s", usage());
    if ( trio_samples && unrelated_samples ) error("Expected only one of the -t/-u options\n");
    if ( !trio_samples && !unrelated_samples ) error("Expected one of the -t/-u options\n");
    if ( !args.prefix ) error("Expected the -p option\n");

    int ret = bcf_hdr_set_samples(args.hdr, trio_samples ? trio_samples : unrelated_samples, 0);
    if ( ret<0 ) error("Could not parse samples: %s\n", trio_samples ? trio_samples : unrelated_samples);
    else if ( ret>0 ) error("%d-th sample not found: %s\n", ret,trio_samples ? trio_samples : unrelated_samples);

    if ( trio_samples )
    {
        int i,n = 0;
        char **list = hts_readlist(trio_samples, 0, &n);
        if ( !list || n!=3 ) error("Expected three sample names with -t\n");
        args.imother = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.ifather = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        args.ichild  = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[2]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_trio;
        args.mode = C_TRIO;
        init_hmm_trio(&args);
    }
    else
    {
        int i,n = 0;
        char **list = hts_readlist(unrelated_samples, 0, &n);
        if ( !list || n!=2 ) error("Expected two sample names with -u\n");
        args.isample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[0]);
        args.jsample = bcf_hdr_id2int(args.hdr, BCF_DT_SAMPLE, list[1]);
        for (i=0; i<n; i++) free(list[i]);
        free(list);
        args.set_observed_prob = set_observed_prob_unrelated;
        args.mode = C_UNRL;
        init_hmm_unrelated(&args);
    }
    return 1;
}
int main_vcfquery(int argc, char *argv[]) { int c, collapse = 0; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"list-samples",0,0,'l'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"format",1,0,'f'}, {"output-file",1,0,'o'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"annots",1,0,'a'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"print-header",0,0,'H'}, {"collapse",1,0,'c'}, {"vcf-list",1,0,'v'}, {"allow-undef-tags",0,0,'u'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) { switch (c) { case 'o': args->fn_out = optarg; break; case 'f': args->format_str = strdup(optarg); break; case 'H': args->print_header = 1; break; case 'v': args->vcf_list = optarg; break; case 'c': if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'a': { kstring_t str = {0,0,0}; kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str); char *p = optarg; while ( *p ) { if ( *p==',' ) kputs("\t%", &str); else kputc(*p, &str); p++; } kputc('\n', &str); args->format_str = str.s; break; } case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; 
targets_is_file = 1; break; case 'l': args->list_columns = 1; break; case 'u': args->allow_undef_tags = 1; break; case 's': args->sample_list = optarg; break; case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } char *fname = NULL; if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; } else fname = argv[optind]; if ( args->list_columns ) { if ( !fname ) error("Missing the VCF file name\n"); args->files = bcf_sr_init(); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); list_columns(args); bcf_sr_destroy(args->files); free(args); return 0; } if ( !args->format_str ) usage(); args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout; if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( !args->vcf_list ) { if ( !fname ) usage(); args->files = bcf_sr_init(); args->files->collapse = collapse; if ( optind+1 < argc ) args->files->require_index = 1; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } while ( fname ) { if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); fname = ++optind < argc ? 
argv[optind] : NULL; } init_data(args); query_vcf(args); free(args->format_str); destroy_data(args); bcf_sr_destroy(args->files); fclose(args->out); free(args); return 0; } // multiple VCFs int i, k, nfiles, prev_nsamples = 0; char **fnames, **prev_samples = NULL; fnames = hts_readlist(args->vcf_list, 1, &nfiles); if ( !nfiles ) error("No files in %s?\n", args->vcf_list); for (i=0; i<nfiles; i++) { args->files = bcf_sr_init(); args->files->collapse = collapse; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( optind < argc ) args->files->require_index = 1; if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum)); for (k=optind; k<argc; k++) if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum)); init_data(args); if ( i==0 ) prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header)); else { args->print_header = 0; if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) ) error("Different samples in %s and %s\n", fnames[i-1],fnames[i]); } query_vcf(args); destroy_data(args); bcf_sr_destroy(args->files); } fclose(args->out); destroy_list(fnames, nfiles); destroy_list(prev_samples, prev_nsamples); free(args->format_str); free(args); return 0; }
/*
 *  Reads sample names and their ploidy (optional) from a file.
 *  Alternatively, if no such file exists, the file name is interpreted
 *  as a comma-separated list of samples. When ploidy is not present,
 *  the default ploidy 2 is assumed.
 *
 *  If parse_ped_samples() returns a list, that list replaces the lines
 *  read. Lines starting with '#' are ignored and samples absent from the
 *  VCF are skipped with a warning.
 *
 *  On return:
 *    args->samples     the (possibly replaced) lines, each cut after the
 *                      sample name
 *    args->nsamples    number of samples found in the VCF
 *    args->samples_map new index -> header index, or NULL when the
 *                      selection is the identity on the first nsamples
 *    args->sample2sex  per new index: -ploidy for a literal 0/1/2, else
 *                      the id returned by ploidy_add_sex()
 *  Family member indices in args->aux.fams are translated to new indices.
 */
static void set_samples(args_t *args, const char *fn, int is_file)
{
    int i, nlines = 0;
    char **lines = hts_readlist(fn, is_file, &nlines);
    if ( !lines ) error("Could not read the file: %s\n", fn);

    int nsmpls;
    char **smpls = parse_ped_samples(&args->aux, lines, nlines, &nsmpls);
    if ( smpls )
    {
        for (i=0; i<nlines; i++) free(lines[i]);
        free(lines);
        lines = smpls;
        nlines = nsmpls;
    }

    args->samples_map = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr)); // for subsetting
    args->sample2sex  = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
    int dflt_sex_id = ploidy_add_sex(args->ploidy, "F");
    for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) args->sample2sex[i] = dflt_sex_id;

    // header index -> new index, -1 for samples that are not selected
    int *old2new = (int*) malloc(sizeof(int)*bcf_hdr_nsamples(args->aux.hdr));
    for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) old2new[i] = -1;

    // NOTE(review): a sample listed more than once is counted each time, so
    // nsmpl could exceed the header sample count - confirm the input is unique
    int nsmpl = 0, map_needed = 0;
    for (i=0; i<nlines; i++)
    {
        char *ss = lines[i];
        while ( *ss && isspace((unsigned char)*ss) ) ss++;
        if ( !*ss ) error("Could not parse: %s\n", lines[i]);
        if ( *ss=='#' ) continue;
        char *se = ss;
        while ( *se && !isspace((unsigned char)*se) ) se++;
        char x = *se, *xptr = se; *se = 0;

        int ismpl = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, ss);
        if ( ismpl < 0 ) { fprintf(stderr,"Warning: No such sample in the VCF: %s\n",ss); continue; }

        // Step over the separator only if there was one; otherwise `se`
        // already points at the end of the line.
        ss = x ? se+1 : se;
        while ( *ss && isspace((unsigned char)*ss) ) ss++;
        if ( !*ss ) ss = "2";   // default ploidy
        se = ss;
        while ( *se && !isspace((unsigned char)*se) ) se++;
        if ( se==ss ) { *xptr = x; error("Could not parse: \"%s\"\n", lines[i]); }

        if ( ss[1]==0 && (ss[0]=='0' || ss[0]=='1' || ss[0]=='2') )
            args->sample2sex[nsmpl] = -1*(ss[0]-'0');
        else
            args->sample2sex[nsmpl] = ploidy_add_sex(args->ploidy, ss);

        if ( ismpl!=nsmpl ) map_needed = 1;
        args->samples_map[nsmpl] = ismpl;
        old2new[ismpl] = nsmpl;
        nsmpl++;
    }

    // Translate the members of every family; a family must be either
    // completely present or completely absent.
    for (i=0; i<args->aux.nfams; i++)
    {
        int j, nmiss = 0;
        family_t *fam = &args->aux.fams[i];
        for (j=0; j<3; j++)
        {
            fam->sample[j] = old2new[fam->sample[j]];
            if ( fam->sample[j]<0 ) nmiss++;
        }
        assert( nmiss==0 || nmiss==3 );
    }
    free(old2new);

    if ( !map_needed ) { free(args->samples_map); args->samples_map = NULL; }

    args->nsamples = nsmpl;
    args->samples = lines;
}