int main_vcfgtcheck(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; set_cwd(args); char *regions = NULL, *targets = NULL; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"GTs-only",1,0,'G'}, {"all-sites",0,0,'a'}, {"homs-only",0,0,'H'}, {"help",0,0,'h'}, {"genotypes",1,0,'g'}, {"plot",1,0,'p'}, {"target-sample",1,0,'S'}, {"query-sample",1,0,'s'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:",loptions,NULL)) >= 0) { switch (c) { case 'G': args->no_PLs = atoi(optarg); break; case 'a': args->all_sites = 1; break; case 'H': args->hom_only = 1; break; case 'g': args->gt_fname = optarg; break; case 'p': args->plot = optarg; break; case 'S': args->target_sample = optarg; break; case 's': args->query_sample = optarg; break; case 'r': regions = optarg; break; case 'R': regions = optarg; regions_is_file = 1; break; case 't': targets = optarg; break; case 'T': targets = optarg; targets_is_file = 1; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } if ( argc==optind || argc>optind+1 ) usage(); // none or too many files given if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode else args->files->require_index = 1; if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open or the file not indexed: %s\n", argv[optind]); if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open or the file not indexed: %s\n", args->gt_fname); 
args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; if ( args->plot ) args->plot = init_prefix(args->plot); init_data(args); if ( args->cross_check ) cross_check_gts(args); else check_gt(args); destroy_data(args); bcf_sr_destroy(args->files); if (args->plot) free(args->plot); free(args); return 0; }
int main_vcfroh(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; args->files = bcf_sr_init(); args->t2AZ = 6.7e-8; args->t2HW = 5e-9; args->rec_rate = 0; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"AF-tag",1,0,0}, {"AF-file",1,0,1}, {"AF-dflt",1,0,2}, {"estimate-AF",1,0,'e'}, {"GTs-only",1,0,'G'}, {"sample",1,0,'s'}, {"hw-to-az",1,0,'a'}, {"az-to-hw",1,0,'H'}, {"viterbi-training",0,0,'V'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"genetic-map",1,0,'m'}, {"rec-rate",1,0,'M'}, {"skip-indels",0,0,'I'}, {0,0,0,0} }; int naf_opts = 0; char *tmp; while ((c = getopt_long(argc, argv, "h?r:R:t:T:H:a:s:m:M:G:Ia:e:V",loptions,NULL)) >= 0) { switch (c) { case 0: args->af_tag = optarg; naf_opts++; break; case 1: args->af_fname = optarg; naf_opts++; break; case 2: args->dflt_AF = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: --AF-dflt %s\n", optarg); break; case 'e': args->estimate_AF = optarg; naf_opts++; break; case 'I': args->snps_only = 1; break; case 'G': args->fake_PLs = 1; args->unseen_PL = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -G %s\n", optarg); args->unseen_PL = pow(10,-args->unseen_PL/10.); break; case 'm': args->genmap_fname = optarg; break; case 'M': args->rec_rate = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -M %s\n", optarg); break; case 's': args->sample = strdup(optarg); break; case 'a': args->t2AZ = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -a %s\n", optarg); break; case 'H': args->t2HW = strtod(optarg,&tmp); if ( *tmp ) error("Could not parse: -H %s\n", optarg); break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; targets_is_file = 1; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 'V': args->vi_training = 1; break; case 'h': 
case '?': usage(args); break; default: error("Unknown argument: %s\n", optarg); } } if ( argc<optind+1 ) usage(args); if ( args->t2AZ<0 || args->t2AZ>1 ) error("Error: The parameter --hw-to-az is not in [0,1]\n", args->t2AZ); if ( args->t2HW<0 || args->t2HW>1 ) error("Error: The parameter --az-to-hw is not in [0,1]\n", args->t2HW); if ( naf_opts>1 ) error("Error: The options --AF-tag, --AF-file and -e are mutually exclusive\n"); if ( args->af_fname && args->targets_list ) error("Error: The options --AF-file and -t are mutually exclusive\n"); if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( args->af_fname ) { if ( bcf_sr_set_targets(args->files, args->af_fname, 1, 3)<0 ) error("Failed to read the targets: %s\n", args->af_fname); } if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open %s: %s\n", argv[optind],bcf_sr_strerror(args->files->errnum)); init_data(args); while ( bcf_sr_next_line(args->files) ) { vcfroh(args, args->files->readers[0].buffer[0]); } vcfroh(args, NULL); fprintf(pysamerr,"Number of lines: total/processed: %d/%d\n", args->ntot,args->nused); destroy_data(args); free(args); return 0; }
/*
 *  Prepare the concat run: merge the headers of all input files into
 *  args->out_hdr, verify that every file carries the same samples in the
 *  same order, open the output and write its header, and set up the
 *  structures needed by the selected mode (allow_overlaps or phased_concat).
 */
static void init_data(args_t *args)
{
    bcf1_t *rec = NULL;

    // In phased mode the first position of each file is remembered so that
    // the need for the next file can be recognised later on
    if ( args->phased_concat )
    {
        args->start_pos = (int*) malloc(sizeof(int)*args->nfnames);
        rec = bcf_init();
    }

    int i, last_chrid = -1;
    for (i=0; i<args->nfnames; i++)
    {
        htsFile *fp = hts_open(args->fnames[i], "r");
        if ( !fp ) error("Failed to open: %s\n", args->fnames[i]);
        bcf_hdr_t *hdr = bcf_hdr_read(fp);
        if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]);

        if ( args->out_hdr )
        {
            // Subsequent files: merge the header and insist on identical samples
            bcf_hdr_combine(args->out_hdr, hdr);
            if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) )
                error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);

            int ismpl;
            for (ismpl=0; ismpl<bcf_hdr_nsamples(hdr); ismpl++)
            {
                if ( strcmp(args->out_hdr->samples[ismpl],hdr->samples[ismpl]) )
                    error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]);
            }
        }
        else
            args->out_hdr = bcf_hdr_dup(hdr);   // first file: its header seeds the output header

        if ( args->phased_concat )
        {
            if ( bcf_read(fp, hdr, rec)!=0 )
                args->start_pos[i] = -2;        // empty file
            else
            {
                // -1 marks the first file seen for a chromosome, otherwise
                // the position of the first record is stored
                int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,rec));
                if ( chrid==last_chrid )
                    args->start_pos[i] = rec->pos;
                else
                    args->start_pos[i] = -1;
                last_chrid = chrid;
            }
        }
        bcf_hdr_destroy(hdr);
        hts_close(fp);
    }
    if ( rec ) bcf_destroy(rec);

    args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int));

    if ( args->phased_concat )
    {
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">");
        bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">");
    }
    bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat");

    args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));
    bcf_hdr_write(args->out_fh, args->out_hdr);

    if ( args->allow_overlaps )
    {
        // All files are opened at once through the synced reader
        args->files = bcf_sr_init();
        args->files->require_index = 1;
        if ( args->regions_list )
        {
            if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 )
                error("Failed to read the regions: %s\n", args->regions_list);
        }
        for (i=0; i<args->nfnames; i++)
        {
            if ( !bcf_sr_add_reader(args->files,args->fnames[i]) )
                error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum));
        }
        return;
    }
    if ( !args->phased_concat ) return;

    // Move the non-empty files to the front, keeping their relative order,
    // by repeatedly swapping the first empty slot with the next non-empty one
    int nkeep = 0;
    for (;;)
    {
        while ( nkeep<args->nfnames && args->start_pos[nkeep]!=-2 ) nkeep++;
        if ( nkeep==args->nfnames ) break;

        i = nkeep;
        while ( i<args->nfnames && args->start_pos[i]==-2 ) i++;
        if ( i==args->nfnames ) break;

        int pos_swap = args->start_pos[nkeep];
        args->start_pos[nkeep] = args->start_pos[i];
        args->start_pos[i] = pos_swap;

        char *name_swap = args->fnames[nkeep];
        args->fnames[nkeep] = args->fnames[i];
        args->fnames[i] = name_swap;
    }
    // Everything from nkeep onwards is an empty file: drop it
    for (i=nkeep; i<args->nfnames; i++) free(args->fnames[i]);
    args->nfnames = nkeep;

    // Neighbouring files with known start positions must be in ascending order
    for (i=1; i<args->nfnames; i++)
    {
        int prev = args->start_pos[i-1], curr = args->start_pos[i];
        if ( prev!=-1 && curr!=-1 && curr<prev )
            error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]);
    }

    args->prev_chr = -1;
    args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
    args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
    args->nmism  = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int));
    args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
    args->phase_set  = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t));
    args->files = bcf_sr_init();
    args->files->require_index = 1;
    args->ifname = 0;
}
/*
 * Read the reference genotypes from the VCF/BCF file `fvcf`.
 *
 * Bi-allelic variants located on a chromosome known to findCHR(), with a
 * complete GT field (two alleles per sample), a minor allele frequency of at
 * least `threshold_maf` among the included samples and a getDistance() value
 * below 1e6 are appended to the genotype_* containers. The counts of variants
 * excluded for the other reasons are reported through `vrb`.
 */
void genrich_data::readReferenceGenotypes(string fvcf) {
	vector < int > mappingS;

	//Opening files
	vrb.title("Reading variant list in [" + fvcf + "] MAF=" + stb.str(threshold_maf));
	bcf_srs_t * sr = bcf_sr_init();
	if(!(bcf_sr_add_reader (sr, fvcf.c_str()))) {
		// NOTE(review): the cases have no break; this relies on vrb.error() not returning - confirm
		switch (sr->errnum) {
		case not_bgzf: vrb.error("File not compressed with bgzip!");
		case idx_load_failed: vrb.error("Impossible to load index file!");
		case file_type_error: vrb.error("File format not detected by htslib!");
		default : vrb.error("Unknown error!");
		}
	}

	//Sample processing: mappingS[i] is the result of findSample() for the i-th
	//sample of the file; negative values mark samples that are not included
	int included_sample = 0;
	int n_samples = bcf_hdr_nsamples(sr->readers[0].header);
	for (int i = 0 ; i < n_samples ; i ++) {
		mappingS.push_back(findSample(string(sr->readers[0].header->samples[i])));
		if (mappingS.back() >= 0) included_sample ++;
	}
	vrb.bullet("#samples = " + stb.str(included_sample));

	//Variant processing
	unsigned int n_excludedV_mult = 0, n_excludedV_void = 0, n_excludedV_rare = 0, n_excludedV_uchr = 0, n_line = 0, n_excludedV_toofar = 0;
	int ngt, ngt_arr = 0, *gt_arr = NULL;
	bcf1_t * line;
	while(bcf_sr_next_line (sr)) {
		line = bcf_sr_get_line(sr, 0);
		if (line->n_allele == 2) {
			bcf_unpack(line, BCF_UN_STR);
			string chr = string(bcf_hdr_id2name(sr->readers[0].header, line->rid));
			int chr_idx = findCHR(chr);
			if (chr_idx >= 0) {
				unsigned int pos = line->pos + 1;
				// gt_arr is (re)allocated by htslib as needed and reused across records
				ngt = bcf_get_genotypes(sr->readers[0].header, line, &gt_arr, &ngt_arr);
				if (ngt == 2*n_samples) {
					// Alternate allele frequency over the included samples only
					double freq = 0.0, tot = 0.0;
					for(int i = 0 ; i < n_samples ; i ++) {
						assert(gt_arr[2*i+0] != bcf_gt_missing && gt_arr[2*i+1] != bcf_gt_missing);
						if (mappingS[i] >= 0) {
							freq += bcf_gt_allele(gt_arr[2*i+0]) + bcf_gt_allele(gt_arr[2*i+1]);
							tot += 2.0;
						}
					}
					double maf = freq / tot;
					if (maf > 0.5) maf = 1.0 - maf;
					if (maf >= threshold_maf) {
						int dist_tss = getDistance(chr_idx, pos);
						if (dist_tss < 1e6) {
							// The variant is keyed as "<chr>_<pos>" and mapped to its index
							string tmp_id = chr + "_" + stb.str(pos);
							genotype_uuid.insert(pair < string, unsigned int > (tmp_id, genotype_pos.size()));
							genotype_chr.push_back(chr_idx);
							genotype_pos.push_back(pos);
							genotype_maf.push_back(maf);
							genotype_dist.push_back(dist_tss);
							genotype_haps.push_back(vector < bool > (2 * included_sample, false));
							for(int i = 0 ; i < n_samples ; i ++) {
								if (mappingS[i] >= 0) {
									genotype_haps.back()[2 * mappingS[i] + 0] = bcf_gt_allele(gt_arr[2 * i + 0]);
									genotype_haps.back()[2 * mappingS[i] + 1] = bcf_gt_allele(gt_arr[2 * i + 1]);
								}
							}
						} else n_excludedV_toofar++;
					} else n_excludedV_rare ++;
				} else n_excludedV_void ++;
			} else n_excludedV_uchr ++;
		} else n_excludedV_mult ++;

		if (n_line % 100000 == 0) vrb.bullet("#lines = " + stb.str(n_line));
		n_line ++;
	}
	genotype_qtl = vector < bool > (genotype_pos.size(), false);
	genotype_gwas = vector < bool > (genotype_pos.size(), false);
	genotype_bin = vector < int > (genotype_pos.size(), -1);

	//Finalize
	free(gt_arr);		// buffer handed out by bcf_get_genotypes; NULL when no record was read
	bcf_sr_destroy(sr);
	vrb.bullet(stb.str(genotype_pos.size()) + " variants included");
	if (n_excludedV_mult > 0) vrb.bullet(stb.str(n_excludedV_mult) + " multi-allelic variants excluded");
	if (n_excludedV_uchr > 0) vrb.bullet(stb.str(n_excludedV_uchr) + " variants with unreferenced chromosome in --tss");
	if (n_excludedV_rare > 0) vrb.bullet(stb.str(n_excludedV_rare) + " maf filtered variants");
	if (n_excludedV_toofar > 0) vrb.bullet(stb.str(n_excludedV_toofar) + " too far variants");
}
int main_vcfisec(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; args->output_type = FT_VCF; static struct option loptions[] = { {"help",0,0,'h'}, {"collapse",1,0,'c'}, {"complement",1,0,'C'}, {"apply-filters",1,0,'f'}, {"nfiles",1,0,'n'}, {"prefix",1,0,'p'}, {"write",1,0,'w'}, {"targets",1,0,'t'}, {"regions",1,0,'r'}, {"output-type",1,0,'O'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hc:r:p:n:w:t:Cf:O:",loptions,NULL)) >= 0) { switch (c) { case 'O': switch (optarg[0]) { case 'b': args->output_type = FT_BCF_GZ; break; case 'u': args->output_type = FT_BCF; break; case 'z': args->output_type = FT_VCF_GZ; break; case 'v': args->output_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'c': if ( !strcmp(optarg,"snps") ) args->files->collapse |= COLLAPSE_SNPS; else if ( !strcmp(optarg,"indels") ) args->files->collapse |= COLLAPSE_INDELS; else if ( !strcmp(optarg,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(optarg,"any") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) args->files->collapse |= COLLAPSE_SOME; else if ( !strcmp(optarg,"none") ) args->files->collapse = COLLAPSE_NONE; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'f': args->files->apply_filters = optarg; break; case 'C': args->isec_op = OP_COMPLEMENT; break; case 'r': args->regions_fname = optarg; break; case 't': args->targets_fname = optarg; break; case 'p': args->prefix = optarg; break; case 'w': args->write_files = optarg; break; case 'n': { char *p = optarg; if ( *p=='-' ) { args->isec_op = OP_MINUS; p++; } else if ( *p=='+' ) { args->isec_op = OP_PLUS; p++; } else if ( *p=='=' ) { args->isec_op = OP_EQUAL; p++; } else if ( isdigit(*p) ) args->isec_op = OP_EQUAL; else 
error("Could not parse --nfiles %s\n", optarg); if ( sscanf(p,"%d",&args->isec_n)!=1 ) error("Could not parse --nfiles %s\n", optarg); } break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } if ( argc-optind<1 ) usage(); // no file given if ( args->targets_fname && bcf_sr_set_targets(args->files, args->targets_fname,0)<0 ) error("Failed to read the targets: %s\n", args->targets_fname); if ( args->regions_fname && bcf_sr_set_regions(args->files, args->regions_fname)<0 ) error("Failed to read the regions: %s\n", args->regions_fname); if ( argc-optind==2 && !args->isec_op ) { args->isec_op = OP_VENN; if ( !args->prefix ) error("Expected the -p option\n"); } if ( !args->targets_fname ) { if ( argc-optind<2 ) error("Expected multiple files or the --targets option\n"); if ( !args->isec_op ) error("Expected two file names or one of the options --complement, --nfiles or --targets\n"); } args->files->require_index = 1; while (optind<argc) { if ( !bcf_sr_add_reader(args->files, argv[optind]) ) error("Failed to open: %s\n", argv[optind]); optind++; } init_data(args); isec_vcf(args); destroy_data(args); bcf_sr_destroy(args->files); free(args); return 0; }
int main_vcfquery(int argc, char *argv[]) { int c, collapse = 0; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->argc = argc; args->argv = argv; int regions_is_file = 0, targets_is_file = 0; static struct option loptions[] = { {"help",0,0,'h'}, {"list-samples",0,0,'l'}, {"include",1,0,'i'}, {"exclude",1,0,'e'}, {"format",1,0,'f'}, {"output-file",1,0,'o'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {"annots",1,0,'a'}, {"samples",1,0,'s'}, {"samples-file",1,0,'S'}, {"print-header",0,0,'H'}, {"collapse",1,0,'c'}, {"vcf-list",1,0,'v'}, {"allow-undef-tags",0,0,'u'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "hlr:R:f:a:s:S:Ht:T:c:v:i:e:o:u",loptions,NULL)) >= 0) { switch (c) { case 'o': args->fn_out = optarg; break; case 'f': args->format_str = strdup(optarg); break; case 'H': args->print_header = 1; break; case 'v': args->vcf_list = optarg; break; case 'c': if ( !strcmp(optarg,"snps") ) collapse |= COLLAPSE_SNPS; else if ( !strcmp(optarg,"indels") ) collapse |= COLLAPSE_INDELS; else if ( !strcmp(optarg,"both") ) collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(optarg,"any") ) collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"all") ) collapse |= COLLAPSE_ANY; else if ( !strcmp(optarg,"some") ) collapse |= COLLAPSE_SOME; else error("The --collapse string \"%s\" not recognised.\n", optarg); break; case 'a': { kstring_t str = {0,0,0}; kputs("%CHROM\t%POS\t%MASK\t%REF\t%ALT\t%", &str); char *p = optarg; while ( *p ) { if ( *p==',' ) kputs("\t%", &str); else kputc(*p, &str); p++; } kputc('\n', &str); args->format_str = str.s; break; } case 'e': args->filter_str = optarg; args->filter_logic |= FLT_EXCLUDE; break; case 'i': args->filter_str = optarg; args->filter_logic |= FLT_INCLUDE; break; case 'r': args->regions_list = optarg; break; case 'R': args->regions_list = optarg; regions_is_file = 1; break; case 't': args->targets_list = optarg; break; case 'T': args->targets_list = optarg; 
targets_is_file = 1; break; case 'l': args->list_columns = 1; break; case 'u': args->allow_undef_tags = 1; break; case 's': args->sample_list = optarg; break; case 'S': args->sample_list = optarg; args->sample_is_file = 1; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } char *fname = NULL; if ( optind>=argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; } else fname = argv[optind]; if ( args->list_columns ) { if ( !fname ) error("Missing the VCF file name\n"); args->files = bcf_sr_init(); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); list_columns(args); bcf_sr_destroy(args->files); free(args); return 0; } if ( !args->format_str ) usage(); args->out = args->fn_out ? fopen(args->fn_out, "w") : stdout; if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); if ( !args->vcf_list ) { if ( !fname ) usage(); args->files = bcf_sr_init(); args->files->collapse = collapse; if ( optind+1 < argc ) args->files->require_index = 1; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } while ( fname ) { if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); fname = ++optind < argc ? 
argv[optind] : NULL; } init_data(args); query_vcf(args); free(args->format_str); destroy_data(args); bcf_sr_destroy(args->files); fclose(args->out); free(args); return 0; } // multiple VCFs int i, k, nfiles, prev_nsamples = 0; char **fnames, **prev_samples = NULL; fnames = hts_readlist(args->vcf_list, 1, &nfiles); if ( !nfiles ) error("No files in %s?\n", args->vcf_list); for (i=0; i<nfiles; i++) { args->files = bcf_sr_init(); args->files->collapse = collapse; if ( args->regions_list && bcf_sr_set_regions(args->files, args->regions_list, regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); if ( optind < argc ) args->files->require_index = 1; if ( args->targets_list ) { if ( bcf_sr_set_targets(args->files, args->targets_list,targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", args->targets_list); } if ( !bcf_sr_add_reader(args->files, fnames[i]) ) error("Failed to open %s: %s\n", fnames[i],bcf_sr_strerror(args->files->errnum)); for (k=optind; k<argc; k++) if ( !bcf_sr_add_reader(args->files, argv[k]) ) error("Failed to open %s: %s\n", argv[k],bcf_sr_strerror(args->files->errnum)); init_data(args); if ( i==0 ) prev_samples = copy_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header)); else { args->print_header = 0; if ( compare_header(args->header, args->files->readers[0].header->samples, bcf_hdr_nsamples(args->files->readers[0].header), prev_samples, prev_nsamples) ) error("Different samples in %s and %s\n", fnames[i-1],fnames[i]); } query_vcf(args); destroy_data(args); bcf_sr_destroy(args->files); } fclose(args->out); destroy_list(fnames, nfiles); destroy_list(prev_samples, prev_nsamples); free(args->format_str); free(args); return 0; }
int main_vcfgtcheck(int argc, char *argv[]) { int c; args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->files = bcf_sr_init(); args->argc = argc; args->argv = argv; set_cwd(args); char *regions = NULL, *targets = NULL; int regions_is_file = 0, targets_is_file = 0; // In simulated sample swaps the minimum error was 0.3 and maximum intra-sample error was 0.23 // - min_inter: pairs with smaller err value will be considered identical // - max_intra: pairs with err value bigger than abs(max_intra_err) will be considered // different. If negative, the cutoff may be heuristically lowered args->min_inter_err = 0.23; args->max_intra_err = -0.3; static struct option loptions[] = { {"cluster",1,0,'c'}, {"GTs-only",1,0,'G'}, {"all-sites",0,0,'a'}, {"homs-only",0,0,'H'}, {"help",0,0,'h'}, {"genotypes",1,0,'g'}, {"plot",1,0,'p'}, {"target-sample",1,0,'S'}, {"query-sample",1,0,'s'}, {"regions",1,0,'r'}, {"regions-file",1,0,'R'}, {"targets",1,0,'t'}, {"targets-file",1,0,'T'}, {0,0,0,0} }; char *tmp; while ((c = getopt_long(argc, argv, "hg:p:s:S:Hr:R:at:T:G:c:",loptions,NULL)) >= 0) { switch (c) { case 'c': args->min_inter_err = strtod(optarg,&tmp); if ( *tmp ) { if ( *tmp!=',') error("Could not parse: -c %s\n", optarg); args->max_intra_err = strtod(tmp+1,&tmp); if ( *tmp ) error("Could not parse: -c %s\n", optarg); } break; case 'G': args->no_PLs = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --GTs-only %s\n", optarg); break; case 'a': args->all_sites = 1; break; case 'H': args->hom_only = 1; break; case 'g': args->gt_fname = optarg; break; case 'p': args->plot = optarg; break; case 'S': args->target_sample = optarg; break; case 's': args->query_sample = optarg; break; case 'r': regions = optarg; break; case 'R': regions = optarg; regions_is_file = 1; break; case 't': targets = optarg; break; case 'T': targets = optarg; targets_is_file = 1; break; case 'h': case '?': usage(); default: error("Unknown argument: %s\n", optarg); } } char *fname = NULL; if 
( optind==argc ) { if ( !isatty(fileno((FILE *)stdin)) ) fname = "-"; // reading from stdin else usage(); // no files given } else fname = argv[optind]; if ( argc>optind+1 ) usage(); // too many files given if ( !args->gt_fname ) args->cross_check = 1; // no genotype file, run in cross-check mode else args->files->require_index = 1; if ( regions && bcf_sr_set_regions(args->files, regions, regions_is_file)<0 ) error("Failed to read the regions: %s\n", regions); if ( targets && bcf_sr_set_targets(args->files, targets, targets_is_file, 0)<0 ) error("Failed to read the targets: %s\n", targets); if ( !bcf_sr_add_reader(args->files, fname) ) error("Failed to open %s: %s\n", fname,bcf_sr_strerror(args->files->errnum)); if ( args->gt_fname && !bcf_sr_add_reader(args->files, args->gt_fname) ) error("Failed to open %s: %s\n", args->gt_fname,bcf_sr_strerror(args->files->errnum)); args->files->collapse = COLLAPSE_SNPS|COLLAPSE_INDELS; if ( args->plot ) args->plot = init_prefix(args->plot); init_data(args); if ( args->cross_check ) cross_check_gts(args); else check_gt(args); destroy_data(args); bcf_sr_destroy(args->files); if (args->plot) free(args->plot); free(args); return 0; }
int run(int argc, char **argv) { args_t *args = (args_t*) calloc(1,sizeof(args_t)); args->nsites = 10; args->min_hets = 0.3; args->background = "X:60001-2699520"; static struct option loptions[] = { {"verbose",1,0,'v'}, {"ploidy",1,0,'p'}, {"nsites",1,0,'n'}, {"guess",1,0,'g'}, {"min-hets",1,0,'m'}, {"background",1,0,'b'}, {0,0,0,0} }; char c, *tmp, *ploidy_fname = NULL; while ((c = getopt_long(argc, argv, "p:n:g:m:vb:",loptions,NULL)) >= 0) { switch (c) { case 'b': if ( !strcmp("-",optarg) ) args->background = NULL; else args->background = optarg; break; case 'v': args->verbose = 1; break; case 'g': if ( !strcasecmp(optarg,"GT") ) args->guess = GUESS_GT; else if ( !strcasecmp(optarg,"PL") ) args->guess = GUESS_PL; else if ( !strcasecmp(optarg,"GL") ) args->guess = GUESS_GL; else error("The argument not recognised, expected --guess GT, --guess PL or --guess GL: %s\n", optarg); break; case 'm': args->min_hets = strtod(optarg,&tmp); if ( *tmp ) error("Unexpected argument to --min-hets: %s\n", optarg); break; case 'p': ploidy_fname = optarg; break; case 'n': args->nsites = strtol(optarg,&tmp,10); if (*tmp) error("Unexpected argument to --nsites: %s\n", optarg); break; case 'h': case '?': default: error("%s", usage()); break; } } args->sr = bcf_sr_init(); args->sr->require_index = 1; if ( !argv[0] ) error("%s", usage()); if ( !bcf_sr_add_reader(args->sr,argv[0]) ) error("Error: %s\n", bcf_sr_strerror(args->sr->errnum)); args->hdr = args->sr->readers[0].header; args->nsample = bcf_hdr_nsamples(args->hdr); args->dflt_ploidy = 2; if ( ploidy_fname ) { args->ploidy = ploidy_init(ploidy_fname, args->dflt_ploidy); if ( !args->ploidy ) error("Could not read %s\n", ploidy_fname); } else { args->ploidy = ploidy_init_string( "X 1 60000 M 1\n" "X 2699521 154931043 M 1\n" "Y 1 59373566 M 1\n" "Y 1 59373566 F 0\n", args->dflt_ploidy); } args->nsex = ploidy_nsex(args->ploidy); args->sex2ploidy = (int*) malloc(sizeof(int)*args->nsex); args->max_ploidy = ploidy_max(args->ploidy); if ( 
args->guess && args->max_ploidy > 2 ) error("Sorry, ploidy %d not supported with -g\n", args->max_ploidy); args->ncounts = args->nsample * ((args->max_ploidy>2 ? args->max_ploidy : 2)+1); args->counts = (int*) malloc(sizeof(int)*args->ncounts); args->bg_counts = (count_t*) calloc(args->nsample,sizeof(count_t)); args->sex2prob = (float*) calloc(args->nsample*args->nsex,sizeof(float)); int i, nseq; for (i=0; i<args->nsample*args->nsex; i++) args->sex2prob[i] = 1; if ( args->verbose && args->guess ) printf("# [1]REG\t[2]Region\t[3]Sample\t[4]Het fraction\t[5]nHet\t[6]nHom\t[7]nMissing\n"); // First get the counts from expected haploid regions regidx_t *idx = ploidy_regions(args->ploidy); char **seqs = regidx_seq_names(idx, &nseq); for (i=0; i<nseq; i++) { regitr_t itr; regidx_overlap(idx, seqs[i], 0, UINT32_MAX, &itr); while ( itr.i < itr.n ) { if ( args->guess ) itr.i += process_region_guess(args, seqs[i], &itr); else itr.i += process_region_precise(args, seqs[i], &itr); } } // Get the counts from a PAR (the background diploid region) and see if the fraction // of hets is different if ( args->guess ) sex2prob_guess(args); for (i=0; i<args->nsample; i++) { int j, jmax = 0; float max = 0, sum = 0; for (j=0; j<args->nsex; j++) { sum += args->sex2prob[i*args->nsex+j]; if ( max < args->sex2prob[i*args->nsex+j] ) { jmax = j; max = args->sex2prob[i*args->nsex+j]; } } if ( args->verbose ) printf("%s\t%s\t%f\n", args->hdr->samples[i],ploidy_id2sex(args->ploidy,jmax),args->sex2prob[i*args->nsex+jmax]/sum); else printf("%s\t%s\n", args->hdr->samples[i],ploidy_id2sex(args->ploidy,jmax)); } bcf_sr_destroy(args->sr); ploidy_destroy(args->ploidy); destroy_regs(args); free(args->sex2ploidy); free(args->counts); free(args->bg_counts); free(args->gts); free(args->pls); free(args->sex2prob); free(args); return 0; }
bcf_srs() { m_sr = bcf_sr_init(); }
/*
 *  Open the input and output of the call command and initialise the calling
 *  structures: targets and regions are registered with the synced reader,
 *  the sample subset (if any) is applied to the header, the output file is
 *  opened and, unless CF_QCALL is set, the caller is initialised and the
 *  output header written.
 */
static void init_data(args_t *args)
{
    args->aux.srs = bcf_sr_init();

    // Open files for input and output, initialize structures
    if ( args->targets )
    {
        // With constrained alleles the targets are read with the last argument set to 3
        if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 )
            error("Failed to read the targets: %s\n", args->targets);

        if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED )
        {
            // Install print_missed_line as the targets' missed-region handler
            args->aux.srs->targets->missed_reg_handler = print_missed_line;
            args->aux.srs->targets->missed_reg_data = args;
        }
    }
    if ( args->regions )
    {
        if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 )
            error("Failed to read the regions: %s\n", args->regions);
    }

    if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) )
        error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum));
    args->aux.hdr = bcf_sr_get_header(args->aux.srs,0);

    int i;
    if ( args->samples_fname )
    {
        set_samples(args, args->samples_fname, args->samples_is_file);
        if ( args->aux.flag&CALL_CONSTR_TRIO )
        {
            if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname);
            fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams);
        }
        // Every sample and every sex starts out with ploidy 2
        args->nsex = ploidy_nsex(args->ploidy);
        args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int));
        args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int));
        args->aux.ploidy = (uint8_t*) malloc(args->nsamples);
        for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2;
        for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2;
    }

    if ( args->samples_map )
    {
        // Work on a header restricted to the requested samples
        args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map);
        if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n");
        for (i=0; i<args->nsamples; i++)
            if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]);
        if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n");
    }
    else
    {
        // No subsetting: work on a private copy of the input header
        args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0));
        for (i=0; i<args->nsamples; i++)
            if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 )
                error("No such sample: %s\n", args->samples[i]);
    }

    args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type));
    if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno));

    // With CF_QCALL nothing below is done; in particular no header is written here
    if ( args->flag & CF_QCALL )
        return;

    if ( args->flag & CF_MCALL )
        mcall_init(&args->aux);

    if ( args->flag & CF_CCALL )
        ccall_init(&args->aux);

    if ( args->flag&CF_GVCF )
    {
        bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">");
        args->gvcf.rid  = -1;
        args->gvcf.line = bcf_init1();
        // Two alleles per sample, all initialised to unphased allele 0
        args->gvcf.gt   = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr));
        for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++)
        {
            args->gvcf.gt[2*i+0] = bcf_gt_unphased(0);
            args->gvcf.gt[2*i+1] = bcf_gt_unphased(0);
        }
    }

    // The QS and I16 INFO definitions are dropped from the output header
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS");
    bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16");

    bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call");
    bcf_hdr_write(args->out_fh, args->aux.hdr);

    if ( args->flag&CF_INS_MISSED ) init_missed_line(args);
}