int init(const char *opts, bcf_hdr_t *in, bcf_hdr_t *out) { in_hdr = in; out_hdr = out; bcf_hdr_append(out_hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">"); bcf_hdr_append(out_hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"); return 0; }
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) { in_hdr = in; out_hdr = out; n = bcf_hdr_nsamples(in_hdr); ngt=2*n; gt = (int *)malloc(ngt*sizeof(int)); bcf_hdr_append(out_hdr, "##INFO=<ID=AF,Number=1,Type=Float,Description=\"allele frequency estimated with EM\">"); bcf_hdr_append(out_hdr, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"); bcf_hdr_append(out_hdr, "##FORMAT=<ID=DS,Number=1,Type=Float,Description=\"Estimated allelic dosage\">"); float Nh = 2. * n; af_start = 1/(log(Nh) + 0.5772156649 + 1 / (2*Nh) - 1 /(12*Nh*Nh)); return 0; }
static void init_header(bcf_hdr_t *hdr, const char *ori, int ori_type, const char *new_hdr_line) { if ( ori ) bcf_hdr_remove(hdr,ori_type,ori); bcf_hdr_append(hdr, new_hdr_line); }
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) { in_hdr = in; out_hdr = out; bcf_hdr_append(out_hdr,"##INFO=<ID=INFO,Number=1,Type=Float,Description=\"IMPUTE2 info score\">"); return 0; }
/** * Copies contigs found in bcf header to another bcf header. */ void bcf_hdr_transfer_contigs(const bcf_hdr_t *hsrc, bcf_hdr_t *hdest) { vdict_t *d = (vdict_t*)hsrc->dict[BCF_DT_CTG]; int tid, m = kh_size(d); const char **names = (const char**) calloc(m,sizeof(const char*)); int len[m]; khint_t k; for (k=kh_begin(d); k<kh_end(d); k++) { if ( !kh_exist(d,k) ) continue; tid = kh_val(d,k).id; len[tid] = bcf_hrec_find_key(kh_val(d, k).hrec[0],"length"); int j; if ( sscanf(kh_val(d, k).hrec[0]->vals[len[tid]],"%d",&j) ) len[tid] = j; names[tid] = kh_key(d,k); } kstring_t s = {0,0,0}; for (tid=0; tid<m; tid++) { s.l = 0; ksprintf(&s, "##contig=<ID=%s,length=%d>", names[tid], len[tid]); bcf_hdr_append(hdest, s.s); } if (s.m) free(s.s); }
/** * Copies contigs found in bam header to bcf header. */ void bam_hdr_transfer_contigs_to_bcf_hdr(const bam_hdr_t *sh, bcf_hdr_t *vh) { kstring_t s = {0,0,0}; for (uint32_t i=0; i<bam_hdr_get_n_targets(sh); ++i) { s.l = 0; ksprintf(&s, "##contig=<ID=%s,length=%d>", bam_hdr_get_target_name(sh)[i], bam_hdr_get_target_len(sh)[i]); bcf_hdr_append(vh, s.s); } if (s.m) free(s.s); }
void ccall_init(call_t *call) { call->cdat = (ccall_t*) calloc(1,sizeof(ccall_t)); call_init_pl2p(call); call->cdat->p1 = bcf_p1_init(bcf_hdr_nsamples(call->hdr), call->ploidy); call->gts = (int*) calloc(bcf_hdr_nsamples(call->hdr)*2,sizeof(int)); // assuming at most diploid everywhere bcf_hdr_append(call->hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"); bcf_hdr_append(call->hdr,"##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele frequency (assuming HWE)\">"); // Todo: groups not migrated to 'bcftools call' yet bcf_hdr_append(call->hdr,"##INFO=<ID=AF2,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first and second group ALT allele frequency (assuming HWE)\">"); bcf_hdr_append(call->hdr,"##INFO=<ID=AC1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the first ALT allele count (no HWE assumption)\">"); bcf_hdr_append(call->hdr,"##INFO=<ID=MQ,Number=1,Type=Integer,Description=\"Root-mean-square mapping quality of covering reads\">\n"); bcf_hdr_append(call->hdr,"##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n"); bcf_hdr_append(call->hdr,"##INFO=<ID=PV4,Number=4,Type=Float,Description=\"P-values for strand bias, baseQ bias, mapQ bias and tail distance bias\">\n"); bcf_hdr_append(call->hdr,"##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n"); bcf_hdr_append(call->hdr,"##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n"); // bcf_hdr_append(call->hdr,); // bcf_hdr_append(call->hdr,); bcf_hdr_append(call->hdr,"##INFO=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-forward , ref-reverse, alt-forward and alt-reverse bases\">"); return; }
/** *Set chromosome name */ void bcf_set_chrom(bcf_hdr_t *h, bcf1_t *v, const char* chrom) { vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; khint_t k = kh_get(vdict, d, chrom); if (k == kh_end(d)) { fprintf(stderr, "[E:%s:%d %s] contig '%s' is not defined in the header\n", __FILE__, __LINE__, __FUNCTION__, chrom); kstring_t contig = {0,0,0}; ksprintf(&contig, "##contig=<ID=%s,length=2147483647>", chrom); bcf_hdr_append(h, contig.s); if (contig.m) free(contig.s); k = kh_get(vdict, d, chrom); } v->rid = kh_val(d, k).id; };
void vcf_misc_hdr_add_cmd(bcf_hdr_t *hdr, const char *cmdline, const char *cwd) { char keystr[8], timestr[100]; time_t tnow; time(&tnow); strftime(timestr, sizeof(timestr), "%Y%m%d-%H:%M:%S", localtime(&tnow)); StrBuf sbuf; strbuf_alloc(&sbuf, 1024); strbuf_sprintf(&sbuf, "##mccortex_%s=<prev=\"NULL\",cmd=\"%s\",cwd=\"%s\"," "datetime=\"%s\",version="CTX_VERSION">\n", hex_rand_str(keystr, sizeof(keystr)), cmdline, cwd, timestr); bcf_hdr_append(hdr, sbuf.b); strbuf_dealloc(&sbuf); }
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) { int c; char *fname = NULL; static struct option loptions[] = { {"exons",1,0,'e'}, {0,0,0,0} }; while ((c = getopt_long(argc, argv, "e:?h",loptions,NULL)) >= 0) { switch (c) { case 'e': fname = optarg; break; case 'h': case '?': default: fprintf(stderr,"%s", usage()); exit(1); break; } } if ( !fname ) { fprintf(stderr,"Missing the -e option.\n"); return -1; } in_hdr = in; out_hdr = out; int ret = bcf_hdr_append(out_hdr,"##INFO=<ID=OOF,Number=A,Type=Integer,Description=\"Frameshift Indels: out-of-frame (1), in-frame (0), not-applicable (-1 or missing)\">"); if ( ret!=0 ) { fprintf(stderr,"Error updating the header\n"); return -1; } exons = bcf_sr_regions_init(fname,1,0,1,2); if ( !exons ) { fprintf(stderr,"Error occurred while reading (was the file compressed with bgzip?): %s\n", fname); return -1; } return 0; }
int init(const char *opts, bcf_hdr_t *in, bcf_hdr_t *out) { in_hdr = in; out_hdr = out; int ret = bcf_hdr_append(out_hdr,"##INFO=<ID=OOF,Number=A,Type=Integer,Description=\"Frameshift Indels: out-of-frame (1), in-frame (0), not-applicable (-1 or missing)\">"); if ( ret!=0 ) { fprintf(stderr,"Error updating the header\n"); return -1; } char *fname = config_get_string(opts,"exons"); if ( !fname ) { fprintf(stderr,"No exons given, please run as \"bcftools annotate -p frameshifts:exons=path/to/file.tab.gz\".\n"); return -1; } wordexp_t wexp; wordexp(fname, &wexp, 0); if ( !wexp.we_wordc ) { fprintf(stderr,"No such file: %s\n", fname); return -1; } free(fname); fname = wexp.we_wordv[0]; exons = bcf_sr_regions_init(fname,1,0,1,2); if ( !exons ) { fprintf(stderr,"Error occurred while reading (was the file compressed with bgzip?): %s\n", fname); return -1; } wordfree(&wexp); return 0; }
int init(int argc, char **argv, bcf_hdr_t *in, bcf_hdr_t *out) { memset(&args,0,sizeof(args_t)); args.in_hdr = in; args.out_hdr = out; static struct option loptions[] = { {"tags",1,0,'t'}, {0,0,0,0} }; int c; while ((c = getopt_long(argc, argv, "?ht:T:l:cd",loptions,NULL)) >= 0) { switch (c) { case 't': args.tags |= parse_tags(&args,optarg); break; case 'h': case '?': default: error("%s", usage()); break; } } if ( optind != argc ) error(usage()); if ( !args.tags ) args.tags |= SET_AN|SET_AC|SET_NS|SET_AC_Hom|SET_AC_Het|SET_AC_Hemi|SET_AF; args.gt_id = bcf_hdr_id2int(args.in_hdr,BCF_DT_ID,"GT"); if ( args.gt_id<0 ) error("Error: GT field is not present\n"); if ( args.tags&SET_AN ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"); if ( args.tags&SET_AC ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">"); if ( args.tags&SET_NS ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of samples with data\">"); if ( args.tags&SET_AC_Hom ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hom,Number=A,Type=Integer,Description=\"Allele counts in homozygous genotypes\">"); if ( args.tags&SET_AC_Het ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Het,Number=A,Type=Integer,Description=\"Allele counts in heterozygous genotypes\">"); if ( args.tags&SET_AC_Hemi ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AC_Hemi,Number=A,Type=Integer,Description=\"Allele counts in hemizygous genotypes\">"); if ( args.tags&SET_AF ) bcf_hdr_append(args.out_hdr, "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Allele frequency\">"); return 0; }
int ctx_vcfcov(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL, *out_type = NULL; uint32_t max_allele_len = 0, max_gt_vars = 0; char *ref_path = NULL; bool low_mem = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; size_t i; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break; case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break; case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break; case 'M': cmd_check(!low_mem, cmd); low_mem = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)"); if(optind+2 > argc) cmd_print_usage("Require VCF and graph files"); if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; status("[vcfcov] max allele length: %u; max number of variants: %u", max_allele_len, max_gt_vars); // open ref // index fasta with: samtools faidx ref.fa faidx_t *fai = fai_load(ref_path); if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path); // Open input VCF file const char *vcf_path = argv[optind++]; htsFile *vcffh = hts_open(vcf_path, "r"); if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path); bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh); if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path); // Test we can close and reopen files if(low_mem) { if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); } // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, low_mem ? -1 : (int64_t)ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *outfh = hts_open(out_path, modes_htslib[mode]); status("[vcfcov] Output format: %s", hsmodes_htslib[mode]); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_COVGS); // // Set up tag names // // *R => ref, *A => alt sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size); // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK> // - K29_kcov is empirical kmer coverage // - K29_nkmers is the number of kmers in the sample // - mean_read_length is the mean read length in bases char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20]; sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size); sprintf(sample_rlk_tag, "mean_read_length"); // // Load kmers if we are using --low-mem // VcfCovStats st; memset(&st, 0, sizeof(st)); VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag, .kcov_alt_tag = kcov_alt_tag, .max_allele_len = max_allele_len, .max_gt_vars = max_gt_vars, .load_kmers_only = false}; if(low_mem) { status("[vcfcov] Loading kmers from VCF+ref"); prefs.load_kmers_only = true; vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai, NULL, &prefs, &st, &db_graph); // Close files hts_close(vcffh); bcf_hdr_destroy(vcfhdr); // Re-open files if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); prefs.load_kmers_only = false; } // // Load graphs // GraphLoadingStats gstats; memset(&gstats, 0, sizeof(gstats)); GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.must_exist_in_graph = low_mem; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // // Set up VCF header / graph matchup // size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t)); // Add samples to vcf header bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr); bcf_hrec_t *hrec; int sid; char hdrstr[200]; for(i = 0; i < db_graph.num_of_cols; i++) { char *sname = db_graph.ginfo[i].sample_name.b; if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) { bcf_hdr_add_sample(outhdr, sname); sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname); } samplehdrids[i] = sid; // Add SAMPLE field hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE"); if(hrec == NULL) { sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname, sample_kcov_tag, gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0, sample_nk_tag, gstats.nkmers[i], sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length); bcf_hdr_append(outhdr, hdrstr); } else { // mean kcovg sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr); // num kmers sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr); // mean read length in kmers sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length); vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr); } status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]); } // Add genotype format fields // One field per alternative allele sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_ref_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_alt_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); bcf_hdr_set_version(outhdr, "VCFv4.2"); // Add command string to header vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd()); if(bcf_hdr_write(outfh, outhdr) != 0) die("Cannot write header to: %s", futil_outpath_str(out_path)); status("[vcfcov] Reading %s and adding coverage", vcf_path); // Reset stats and get coverage memset(&st, 0, sizeof(st)); vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai, samplehdrids, &prefs, &st, &db_graph); // Print statistics char ns0[50], ns1[50]; status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0)); status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0)); status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0)); status("[vcfcov] ALTs used: %s / %s (%.2f%%)", ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0); status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len, ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0); status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)", max_gt_vars, db_graph.kmer_size, ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0); status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)", ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0); status("[vcfcov] Saved to: %s\n", out_path); ctx_free(samplehdrids); graph_loading_stats_destroy(&gstats); bcf_hdr_destroy(vcfhdr); bcf_hdr_destroy(outhdr); hts_close(vcffh); hts_close(outfh); fai_destroy(fai); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) { cerr << "Input: " << input << "\tOutput: "<<output<<endl; kstream_t *ks; kstring_t str = {0,0,0}; gzFile fp = gzopen(input, "r"); VarBuffer vbuf(1000); int prev_rid = -1; if(fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } char *out_fname = (char *)malloc(strlen(output)+5); strcpy(out_fname,output); strcat(out_fname,".tmp"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("depth: %s\n",out_fname); gzFile depth_fp = gzopen(out_fname, "wb1"); strcpy(out_fname,output); strcat(out_fname,".bcf"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("variants: %s\n",out_fname); htsFile *variant_fp=hts_open(out_fname,"wb1"); if(variant_fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } ks = ks_init(fp); htsFile *hfp=hts_open(input, "r"); bcf_hdr_t *hdr_in = bcf_hdr_read(hfp); hts_close(hfp); //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0); //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0); // bcf_hdr_t *hdr_out=hdr_in; bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr_in); remove_hdr_lines(hdr_out,BCF_HL_INFO); remove_hdr_lines(hdr_out,BCF_HL_FLT); bcf_hdr_sync(hdr_out); //here we add FORMAT/PF. which is the pass filter flag for alts. assert( bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0); args_t *norm_args = init_vcfnorm(hdr_out,ref); norm_args->check_ref |= CHECK_REF_WARN; bcf1_t *bcf_rec = bcf_init(); bcf_hdr_write(variant_fp, hdr_out); kstring_t work1 = {0,0,0}; int buf[5]; ks_tokaux_t aux; int ndec=0; int ref_len,alt_len; while( ks_getuntil(ks, '\n', &str, 0) >=0) { // fprintf(stderr,"%s\n",str.s); if(str.s[0]!='#') { char *ptr = kstrtok(str.s,"\t",&aux);//chrom ptr = kstrtok(NULL,NULL,&aux);//pos work1.l=0; kputsn(str.s,ptr-str.s-1, &work1); buf[0] = bcf_hdr_name2id(hdr_in, work1.s); assert( buf[0]>=0); buf[1]=atoi(ptr)-1; ptr = kstrtok(NULL,NULL,&aux);//ID ptr = kstrtok(NULL,NULL,&aux);//REF ref_len=0; while(ptr[ref_len]!='\t') ref_len++; ptr = kstrtok(NULL,NULL,&aux);//ALT bool is_variant=false; alt_len=0; while(ptr[alt_len]!='\t') alt_len++; if(ptr[0]!='.') is_variant=true; char * QUAL_ptr = kstrtok(NULL, NULL, &aux); assert (QUAL_ptr != NULL); for(int i=0;i<2;i++) ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO //find END if it is there char *end_ptr=strstr(ptr,"END=") ; if(end_ptr!=NULL) buf[2]=atoi(end_ptr+4)-1; else buf[2]=buf[1]+alt_len-1; ptr = kstrtok(NULL,NULL,&aux);//FORMAT //find index of DP (if present) //if not present, dont output anything (indels ignored) char *DP_ptr = find_format(ptr,"DP"); int GQX = 0; int QUAL = 0; // AH: change code to use the minimum of GQ and QUAL fields if // GQX is not defined. See here: // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file // "GQXGenotype quality. GQX is the minimum of the GQ value // and the QUAL column. In general, these are similar values; // taking the minimum makes GQX the more conservative measure of // genotype quality." if(DP_ptr!=NULL) { buf[3]=atoi(DP_ptr); char *GQX_ptr = find_format(ptr,"GQX"); if (GQX_ptr == NULL) { GQX_ptr = find_format(ptr,"GQ"); GQX = atoi(GQX_ptr); if (QUAL_ptr[0] != '.') { QUAL = atoi(QUAL_ptr); if (QUAL < GQX) GQX = QUAL; } } else { GQX = atoi(GQX_ptr); } //trying to reduce entropy on GQ to get better compression performance. //1. rounds down to nearest 10. //2. sets gq to min(gq,100). buf[4]=GQX/10; buf[4]*=10; if(buf[4]>100) buf[4]=100; // printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]); if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int))) die("ERROR: problem writing "+(string)out_fname+".tmp"); } if(is_variant) {//wass this a variant? if so write it out to the bcf norm_args->ntotal++; vcf_parse(&str,hdr_in,bcf_rec); // cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl; if(prev_rid!=bcf_rec->rid) vbuf.flush(variant_fp,hdr_out); else vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); prev_rid=bcf_rec->rid; int32_t pass = bcf_has_filter(hdr_in, bcf_rec, "."); bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1); bcf_update_filter(hdr_out,bcf_rec,NULL,0); if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3 norm_args->nsplit++; split_multiallelic_to_biallelics(norm_args,bcf_rec ); for(int i=0;i<norm_args->ntmp_lines;i++){ remove_info(norm_args->tmp_lines[i]); if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH) ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } } else { remove_info(bcf_rec); if( realign(norm_args,bcf_rec) != ERR_REF_MISMATCH) ndec+=decompose(bcf_rec,hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); } } } vbuf.flush(variant_fp,hdr_out); bcf_hdr_destroy(hdr_in); bcf_hdr_destroy(hdr_out); bcf_destroy1(bcf_rec); ks_destroy(ks); gzclose(fp); gzclose(depth_fp); free(str.s); free(work1.s); hts_close(variant_fp); destroy_data(norm_args); fprintf(stderr,"Variant lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped); fprintf(stderr,"Decomposed %d MNPs\n", ndec); fprintf(stderr,"Indexing %s\n",out_fname); bcf_index_build(out_fname, BCF_LIDX_SHIFT); free(out_fname); return 0; }
//{{{ void print_query_result_offset(uint32_t *mask, void print_query_result_offset(uint32_t *mask, uint32_t mask_len, uint32_t *vids, struct gqt_query *q, uint32_t **counts, uint32_t *id_lens, uint32_t *U_R, uint32_t U_R_len, char **id_query_list, char **gt_query_list, uint32_t num_qs, uint32_t num_fields, char *off_file_name, char *source_file, char *full_cmd) { struct off_file *off_f = open_off_file(off_file_name); struct bcf_file bcf_f = init_bcf_file(source_file); char *sample_names = NULL; uint32_t i,j,k,line_idx,bytes, bit_i = 0; int r; for (i = 0; i < U_R_len; ++i) { if (i == 0 ) r = asprintf(&sample_names, "%s", bcf_f.hdr->samples[U_R[i]]); else r = asprintf(&sample_names, "%s,%s", sample_names, bcf_f.hdr->samples[U_R[i]]); if (r == -1) err(EX_OSERR, "asprintf error"); } if (bcf_hdr_set_samples(bcf_f.hdr, sample_names, 0) != 0) errx(EX_DATAERR, "Error setting samples: %s\n", source_file); char *info_s; for (i = 0; i < num_qs; i++) { if ( q[i].variant_op == p_count ) { r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Integer," "Description=\"GQT count result from " "phenotype:'%s' genotype:'%s'\">", i, id_query_list[i], gt_query_list[i]); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); } else if ( q[i].variant_op == p_pct ) { r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float," "Description=\"GQT percent result from " "phenotype:'%s' genotype:'%s'\">", i, id_query_list[i], gt_query_list[i]); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); } else if ( q[i].variant_op == p_maf ) { r = asprintf(&info_s, "##INFO=<ID=GQT_%u,Number=1,Type=Float," "Description=\"GQT maf result from " "phenotype:'%s' genotype:'%s'\">", i, id_query_list[i], gt_query_list[i]); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); } } r = asprintf(&info_s, "##%s_queryVersion=%s", PROGRAM_NAME, VERSION); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); r = asprintf(&info_s, "##%s_queryCommand=%s", PROGRAM_NAME, full_cmd); if (r == -1) err(EX_OSERR, "asprintf error"); if (bcf_hdr_append(bcf_f.hdr, info_s) != 0) errx(EX_DATAERR, "Error updating header: %s\n", source_file); htsFile *out_f = hts_open("-","w"); if ( !out_f ) err(EX_DATAERR, "Could open output file"); bcf_hdr_write(out_f, bcf_f.hdr); bcf_f.line = bcf_init1(); for (i=0; i < mask_len; ++i) { bytes = mask[i]; if (bytes == 0) continue; /* skip a bunch of ops if you can */ for (j=0; j < 32; j++) { if (bytes & 1 << (31 - j)) { line_idx = i*32+j; r = goto_bcf_line(&bcf_f, off_f, line_idx); if (r == -1) err(EX_NOINPUT, "Error seeking file '%s'", bcf_f.file_name); r = get_bcf_line(&bcf_f); if (r == -1) err(EX_NOINPUT, "Error reading file '%s'", bcf_f.file_name); for (k=0; k < num_qs; k++) { r = asprintf(&info_s, "GQT_%u", k); if (r == -1) err(EX_OSERR, "asprintf error"); if ( q[k].variant_op == p_count ) { int32_t v = counts[k][line_idx]; if (bcf_update_info_int32(bcf_f.hdr, bcf_f.line, info_s, &v, 1) != 0) errx(EX_DATAERR, "Error adding to info field: %s\n", bcf_f.file_name); } else if (q[k].variant_op == p_pct) { float v = ((float)counts[k][line_idx])/ ((float) id_lens[k]); if (bcf_update_info_float(bcf_f.hdr, bcf_f.line, info_s, &v, 1) != 0) errx(EX_DATAERR, "Error adding to info field: %s\n", bcf_f.file_name); } else if (q[k].variant_op == p_maf) { float v = ((float)counts[k][line_idx])/ (((float) id_lens[k])*2.0); if (bcf_update_info_float(bcf_f.hdr, bcf_f.line, info_s, &v, 1) != 0) errx(EX_DATAERR, "Error adding to info field: %s\n", bcf_f.file_name); } } bcf_write(out_f, bcf_f.hdr, bcf_f.line); } bit_i++; if (bit_i == num_fields) break; } if (bit_i == num_fields) break; } hts_close(out_f); destroy_off_file(off_f); }
// only if annotation database is VCF/BCF file, header_in has values or else header_in == NULL anno_col_t *init_columns(const char *rules, bcf_hdr_t *header_in, bcf_hdr_t *header_out, int *ncols, enum anno_type type) { assert(rules != NULL); if (type == anno_is_vcf && header_in == NULL) { error("Inconsistent file type!"); } char *ss = (char*)rules, *se = ss; int nc = 0; anno_col_t *cols = NULL; kstring_t tmp = KSTRING_INIT; kstring_t str = KSTRING_INIT; int i = -1; while (*ss) { if ( *se && *se!=',' ) { se++; continue; } int replace = REPLACE_ALL; if ( *ss=='+') { replace = REPLACE_MISSING; ss++; } else if (*ss=='-') { replace = REPLACE_EXISTING; ss++; } i++; str.l = 0; kputsn(ss, se-ss, &str); if ( !str.s[0] ) { warnings("Empty tag in %s", rules); } else if ( !strcasecmp("CHROM", str.s) || !strcasecmp("POS", str.s) || !strcasecmp("FROM", str.s) || !strcasecmp("TO", str.s) || !strcasecmp("REF", str.s) || !strcasecmp("ALT", str.s) || !strcasecmp("FILTER", str.s) || !strcasecmp("QUAL", str.s)) { warnings("Skip tag %s", str.s); } else if ( !strcasecmp("ID", str.s) ) { nc++; cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)* (nc)); struct anno_col *col = &cols[nc-1]; col->icol = i; col->replace = replace; col->setter = type == anno_is_vcf ? vcf_setter_id : setter_id; col->hdr_key = strdup(str.s); } else if (!strcasecmp("INFO", str.s) || !strcasecmp("FORMAT", str.s) ) { error("do not support annotate all INFO,FORMAT fields. todo INFO/TAG instead\n"); } else if (!strncasecmp("FORMAT/", str.s, 7) || !strncasecmp("FMT/", str.s, 4)) { char *key = str.s + (!strncasecmp("FMT", str.s, 4) ? 4 : 7); if (!strcasecmp("GT", key)) error("It is not allowed to change GT tag."); int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s); if ( !bcf_hdr_idinfo_exists(header_out, BCF_HL_FMT, hdr_id) ) { if ( type == anno_is_vcf ) { bcf_hrec_t *hrec = bcf_hdr_get_hrec(header_in, BCF_HL_FMT, "ID", str.s, NULL); if ( !hrec ) error("The tag \"%s\" is not defined in header: %s\n", str.s, rules); tmp.l = 0; bcf_hrec_format(hrec, &tmp); bcf_hdr_append(header_out, tmp.s); bcf_hdr_sync(header_out); hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s); assert( bcf_hdr_idinfo_exists(header_out, BCF_HL_FMT, hdr_id) ); } else { error("The tag \"%s\" is not defined in header: %s\n", str.s, rules); } } //int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, key); nc++; cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)*(nc)); struct anno_col *col = &cols[nc-1]; col->icol = -1; col->replace = replace; col->hdr_key = strdup(key); switch ( bcf_hdr_id2type(header_out, BCF_HL_FMT, hdr_id) ) { case BCF_HT_INT: col->setter = type == anno_is_vcf ? vcf_setter_format_int : setter_format_int; break; case BCF_HT_REAL: col->setter = type == anno_is_vcf ? vcf_setter_format_real : setter_format_real; break; case BCF_HT_STR: col->setter = type == anno_is_vcf ? vcf_setter_format_str : setter_format_str; break; default : error("The type of %s not recognised (%d)\n", str.s, bcf_hdr_id2type(header_out, BCF_HL_FMT, hdr_id)); } } else if ( !strncasecmp("INFO/", str.s, 5) ) { memmove(str.s, str.s+5, str.l-4); str.l -= 4; int hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s); if ( !bcf_hdr_idinfo_exists(header_out, BCF_HL_INFO, hdr_id) ) { if ( type == anno_is_vcf ) { bcf_hrec_t *hrec = bcf_hdr_get_hrec(header_in, BCF_HL_INFO, "ID", str.s, NULL); if ( !hrec ) error("The tag \"%s\" is not defined in header: %s\n", str.s, rules); tmp.l = 0; bcf_hrec_format(hrec, &tmp); bcf_hdr_append(header_out, tmp.s); bcf_hdr_sync(header_out); hdr_id = bcf_hdr_id2int(header_out, BCF_DT_ID, str.s); assert( bcf_hdr_idinfo_exists(header_out, BCF_HL_INFO, hdr_id) ); } else { error("The tag \"%s\" is not defined in header: %s\n", str.s, rules); } } nc++; cols = (struct anno_col*) realloc(cols, sizeof(struct anno_col)*(nc)); struct anno_col *col = &cols[nc-1]; col->icol = i; col->replace = replace; col->hdr_key = strdup(str.s); col->number = bcf_hdr_id2length(header_out, BCF_HL_INFO, hdr_id); switch ( bcf_hdr_id2type(header_out, BCF_HL_INFO, hdr_id) ) { case BCF_HT_FLAG: col->setter = type == anno_is_vcf ? vcf_setter_info_flag : setter_info_flag; break; case BCF_HT_INT: col->setter = type == anno_is_vcf ? vcf_setter_info_int : setter_info_int; break; case BCF_HT_REAL: col->setter = type == anno_is_vcf ? vcf_setter_info_real : setter_info_real; break; case BCF_HT_STR: col->setter = type == anno_is_vcf ? vcf_setter_info_str : setter_info_str; break; default: error("The type of %s not recognised (%d)\n", str.s, bcf_hdr_id2type(header_out, BCF_HL_INFO, hdr_id)); } } if ( !*se ) break; ss = ++se; } *ncols = nc; if (str.m) free(str.s); if (tmp.m) free(tmp.s); return cols; }
/* * Performs pileup * @param conf configuration for this pileup * @param n number of files specified in fn * @param fn filenames */ static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_hdr_t *h = NULL; /* header of first file in input list */ char *ref; void *rghash = NULL; FILE *pileup_fp = NULL; bcf_callaux_t *bca = NULL; bcf_callret1_t *bcr = NULL; bcf_call_t bc; htsFile *bcf_fp = NULL; bcf_hdr_t *bcf_hdr = NULL; bam_sample_t *sm = NULL; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(mplp_aux_t*)); plp = calloc(n, sizeof(bam_pileup1_t*)); n_plp = calloc(n, sizeof(int)); sm = bam_smpl_init(); if (n == 0) { fprintf(stderr,"[%s] no input file/data given\n", __func__); exit(1); } // read the header of each file in the list and initialize data for (i = 0; i < n; ++i) { bam_hdr_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = sam_open(fn[i], "rb"); if ( !data[i]->fp ) { fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(1); } hts_set_fai_filename(data[i]->fp, conf->fai_fname); data[i]->conf = conf; h_tmp = sam_hdr_read(data[i]->fp); if ( !h_tmp ) { fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); exit(1); } data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(1); } if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg); exit(1); } if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; hts_idx_destroy(idx); } if (i == 0) h = h_tmp; /* save the header of first file in list */ else { // FIXME: to check consistency bam_hdr_destroy(h_tmp); } } // allocate data storage proportionate to number of samples being studied sm->n gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_BCF) { const char *mode; if ( conf->flag & MPLP_VCF ) mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz"; // uncompressed VCF or compressed VCF else mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb"; // uncompressed BCF or compressed BCF bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); if (bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(1); } bcf_hdr = bcf_hdr_init("w"); kstring_t str = {0,0,0}; ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version()); bcf_hdr_append(bcf_hdr, str.s); str.l = 0; ksprintf(&str, "##samtoolsCommand=samtools mpileup"); for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]); kputc('\n', &str); bcf_hdr_append(bcf_hdr, str.s); if (conf->fai_fname) { str.l = 0; ksprintf(&str, "##reference=file://%s\n", conf->fai_fname); bcf_hdr_append(bcf_hdr, str.s); } // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; i<h->n_targets; i++) { str.l = 0; ksprintf(&str, "##contig=<ID=%s,length=%d>", h->target_name[i], h->target_len[i]); bcf_hdr_append(bcf_hdr, str.s); } free(str.s); bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); #if CDF_MWU_TESTS bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); #endif bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); if ( conf->fmt_flag&B2B_FMT_DP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); if ( conf->fmt_flag&B2B_FMT_DV ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); if ( conf->fmt_flag&B2B_FMT_DPR ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_INFO_DPR ) bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_FMT_DP4 ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); if ( conf->fmt_flag&B2B_FMT_SP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); for (i=0; i<sm->n; i++) bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); bcf_hdr_add_sample(bcf_hdr, NULL); bcf_hdr_write(bcf_fp, bcf_hdr); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; bc.bcf_hdr = bcf_hdr; bc.n = sm->n; bc.PL = malloc(15 * sm->n * sizeof(*bc.PL)); if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4); bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) ) { // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); for (i=0; i<sm->n; i++) bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES; } } } else { pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout; if (pileup_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); exit(1); } } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; // begin pileup iter = bam_mplp_init(n, mplp_func, (void**)data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_BCF) { int total_depth, _ref0, ref16; for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = seq_nt16_table[_ref0]; bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bc.tid = tid; bc.pos = pos; bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); } } } else { fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j, cnt; for (j = cnt = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; } fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); } else { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if (c >= conf->min_baseQ) { c = c + 33 < 126? c + 33 : 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_MAPQ) { putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if ( c < conf->min_baseQ ) continue; c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_POS) { putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { if (j > 0) putc(',', pileup_fp); fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... } } } } putc('\n', pileup_fp); } } // clean up free(bc.tmp.s); bcf_destroy1(bcf_rec); if (bcf_fp) { hts_close(bcf_fp); bcf_hdr_destroy(bcf_hdr); bcf_call_destroy(bca); free(bc.PL); free(bc.DP4); free(bc.DPR); free(bc.fmt_arr); free(bcr); } if (pileup_fp && conf->output_fname) fclose(pileup_fp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bam_mplp_destroy(iter); bam_hdr_destroy(h); for (i = 0; i < n; ++i) { sam_close(data[i]->fp); if (data[i]->iter) hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return ret; }
static void init_data(args_t *args) { int i; args->hdr = args->files->readers[0].header; if (args->calc_ac && args->update_info) { bcf_hdr_append(args->hdr,"##INFO=<ID=AC,Number=A,Type=Integer,Description=\"Allele count in genotypes\">"); bcf_hdr_append(args->hdr,"##INFO=<ID=AN,Number=1,Type=Integer,Description=\"Total number of alleles in called genotypes\">"); } bcf_hdr_append_version(args->hdr, args->argc, args->argv, "bcftools_view"); // setup sample data if (args->sample_names) { void *hdr_samples = khash_str2int_init(); for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) khash_str2int_inc(hdr_samples, bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); void *exclude = (args->sample_names[0]=='^') ? khash_str2int_init() : NULL; int nsmpl; char **smpl = NULL; args->samples = NULL; args->n_samples = 0; smpl = hts_readlist(exclude ? &args->sample_names[1] : args->sample_names, args->sample_is_file, &nsmpl); if ( !smpl ) { error("Could not read the list: \"%s\"\n", exclude ? &args->sample_names[1] : args->sample_names); } if ( exclude ) { for (i=0; i<nsmpl; i++) { if (!khash_str2int_has_key(hdr_samples,smpl[i])) { if (args->force_samples) { fprintf(stderr, "Warn: exclude called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); } else { error("Error: exclude called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } khash_str2int_inc(exclude, smpl[i]); } for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) { if ( exclude && khash_str2int_has_key(exclude,bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)) ) continue; args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(bcf_hdr_int2id(args->hdr,BCF_DT_SAMPLE,i)); } khash_str2int_destroy(exclude); } else { for (i=0; i<nsmpl; i++) { if (!khash_str2int_has_key(hdr_samples,smpl[i])) { if (args->force_samples) { fprintf(stderr, "Warn: subset called for sample that does not exist in header: \"%s\"... skipping\n", smpl[i]); continue; } else { error("Error: subset called for sample that does not exist in header: \"%s\". Use \"--force-samples\" to ignore this error.\n", smpl[i]); } } args->samples = (char**) realloc(args->samples, (args->n_samples+1)*sizeof(const char*)); args->samples[args->n_samples++] = strdup(smpl[i]); } } for (i=0; i<nsmpl; i++) free(smpl[i]); free(smpl); khash_str2int_destroy(hdr_samples); if (args->n_samples == 0) { fprintf(stderr, "Warn: subsetting has removed all samples\n"); args->sites_only = 1; } } if (args->n_samples) args->imap = (int*)malloc(args->n_samples * sizeof(int)); // determine variant types to include/exclude if (args->include_types || args->exclude_types) { if (args->include_types && args->exclude_types) { fprintf(stderr, "Error: only supply one of --include-types, --exclude-types options\n"); exit(1); } char **type_list = 0; int m = 0, n = 0; const char *q, *p; for (q = p = args->include_types ? args->include_types : args->exclude_types;; ++p) { if (*p == ',' || *p == 0) { if (m == n) { m = m? m<<1 : 16; type_list = (char**)realloc(type_list, m * sizeof(char*)); } type_list[n] = (char*)calloc(p - q + 1, 1); strncpy(type_list[n++], q, p - q); q = p + 1; if (*p == 0) break; } } type_list = (char**)realloc(type_list, n * sizeof(char*)); if (args->include_types) { args->include = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->include |= VCF_SNP; else if (strcmp(type_list[i], "indels") == 0) args->include |= VCF_INDEL; else if (strcmp(type_list[i], "mnps") == 0) args->include |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->include |= VCF_OTHER; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } if (args->exclude_types) { args->exclude = 0; for (i = 0; i < n; ++i) { if (strcmp(type_list[i], "snps") == 0) args->exclude |= VCF_SNP; else if (strcmp(type_list[i], "indels") == 0) args->exclude |= VCF_INDEL; else if (strcmp(type_list[i], "mnps") == 0) args->exclude |= VCF_MNP; else if (strcmp(type_list[i], "other") == 0) args->exclude |= VCF_OTHER; else { fprintf(stderr, "[E::%s] unknown type\n", type_list[i]); fprintf(stderr, "Accepted types are snps, indels, mnps, other\n"); exit(1); } } } for (i = 0; i < n; ++i) free(type_list[i]); free(type_list); } // setup output char modew[8]; strcpy(modew, "w"); if (args->clevel >= 0 && args->clevel <= 9) sprintf(modew + 1, "%d", args->clevel); if (args->output_type==FT_BCF) strcat(modew, "bu"); // uncompressed BCF else if (args->output_type & FT_BCF) strcat(modew, "b"); // compressed BCF else if (args->output_type & FT_GZ) strcat(modew,"z"); // compressed VCF args->out = hts_open(args->fn_out ? args->fn_out : "-", modew); if ( !args->out ) error("%s: %s\n", args->fn_out,strerror(errno)); // headers: hdr=full header, hsub=subset header, hnull=sites only header if (args->sites_only){ args->hnull = bcf_hdr_subset(args->hdr, 0, 0, 0); bcf_hdr_remove(args->hnull, BCF_HL_FMT, NULL); } if (args->n_samples > 0) { args->hsub = bcf_hdr_subset(args->hdr, args->n_samples, args->samples, args->imap); if ( !args->hsub ) error("Error occurred while subsetting samples\n"); if ( args->n_samples != bcf_hdr_nsamples(args->hsub) ) { int i; for (i=0; i<args->n_samples; i++) if ( args->imap[i]<0 ) error("Error: No such sample: \"%s\"\n", args->samples[i]); } } if ( args->filter_str ) args->filter = filter_init(args->hdr, args->filter_str); }
int main(int argc, char **argv) { char *fname = argc>1 ? argv[1] : "/dev/null"; htsFile *fp = hts_open(fname, "w"); bcf_hdr_t *hdr1, *hdr2; hdr1 = bcf_hdr_init("w"); hdr2 = bcf_hdr_init("w"); // Add two shared and two private annotations bcf_hdr_append(hdr1, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT1,Description=\"Filter 1\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT4,Description=\"Filter 4\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF1,Number=.,Type=Integer,Description=\"Info 1\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF4,Number=.,Type=Integer,Description=\"Info 4\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT1,Number=.,Type=Integer,Description=\"FMT 1\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT4,Number=.,Type=Integer,Description=\"FMT 4\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_add_sample(hdr1,"SMPL1"); bcf_hdr_add_sample(hdr1,"SMPL2"); bcf_hdr_add_sample(hdr2,"SMPL1"); bcf_hdr_add_sample(hdr2,"SMPL2"); bcf_hdr_sync(hdr1); bcf_hdr_sync(hdr2); hdr2 = bcf_hdr_merge(hdr2,hdr1); bcf_hdr_sync(hdr2); if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname); bcf1_t *rec = bcf_init1(); rec->rid = bcf_hdr_name2id(hdr1, "1"); rec->pos = 0; bcf_update_alleles_str(hdr1, rec, "G,A"); int32_t tmpi[3]; tmpi[0] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT1"); tmpi[1] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"); tmpi[2] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT3"); bcf_update_filter(hdr1, rec, tmpi, 3); tmpi[0] = 1; bcf_update_info_int32(hdr1, rec, "INF1", tmpi, 1); tmpi[0] = 2; bcf_update_info_int32(hdr1, rec, "INF2", tmpi, 1); tmpi[0] = 3; bcf_update_info_int32(hdr1, rec, "INF3", tmpi, 1); tmpi[0] = tmpi[1] = 1; bcf_update_format_int32(hdr1, rec, "FMT1", tmpi, 2); tmpi[0] = tmpi[1] = 2; bcf_update_format_int32(hdr1, rec, "FMT2", tmpi, 2); tmpi[0] = tmpi[1] = 3; bcf_update_format_int32(hdr1, rec, "FMT3", tmpi, 2); bcf_remove_filter(hdr1, rec, bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"), 0); bcf_update_info_int32(hdr1, rec, "INF2", NULL, 0); bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0); bcf_translate(hdr2, hdr1, rec); if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname); // Clean bcf_destroy1(rec); bcf_hdr_destroy(hdr1); bcf_hdr_destroy(hdr2); int ret; if ( (ret=hts_close(fp)) ) { fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); exit(ret); } return 0; }
static int mpileup(mplp_conf_t *conf) { if (conf->nfiles == 0) { fprintf(stderr,"[%s] no input file/data given\n", __func__); exit(EXIT_FAILURE); } mplp_ref_t mp_ref = MPLP_REF_INIT; conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t)); conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*)); conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*)); conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int)); // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index // must be kept in the memory for the whole time which can be a problem with many bams. // Therefore if none or only one region is requested, we initialize the bam iterator as // before and free the index. Only when multiple regions are queried, we keep the index. int nregs = 0; if ( conf->reg_fname ) { if ( conf->reg_is_file ) { conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL); if ( !conf->reg ) { fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } else { conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) { fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } nregs = regidx_nregs(conf->reg); conf->reg_itr = regitr_init(conf->reg); regitr_loop(conf->reg_itr); // region iterator now positioned at the first region } // read the header of each file in the list and initialize data // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least! bam_hdr_t *hdr = NULL; // header of first file in input list int i; for (i = 0; i < conf->nfiles; ++i) { bam_hdr_t *h_tmp; conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t)); conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb"); if ( !conf->mplp_data[i]->fp ) { fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); exit(EXIT_FAILURE); } if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) { fprintf(stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); exit(EXIT_FAILURE); } conf->mplp_data[i]->conf = conf; conf->mplp_data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(conf->mplp_data[i]->fp); if ( !h_tmp ) { fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]); if ( conf->mplp_data[i]->bam_id<0 ) { // no usable readgroups in this bam, it can be skipped sam_close(conf->mplp_data[i]->fp); free(conf->mplp_data[i]); bam_hdr_destroy(h_tmp); free(conf->files[i]); if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1)); conf->nfiles--; i--; continue; } if (conf->reg) { hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); if (idx == NULL) { fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s); if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } if ( nregs==1 ) // no need to keep the index in memory hts_idx_destroy(idx); else conf->mplp_data[i]->idx = idx; } if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */ else { // FIXME: check consistency between h and h_tmp bam_hdr_destroy(h_tmp); // we store only the first file's header; it's (alleged to be) // compatible with the i-th file's target_name lookup needs conf->mplp_data[i]->h = hdr; } } // allocate data storage proportionate to number of samples being studied sm->n bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n); conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); if (conf->bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads); // BCF header creation conf->bcf_hdr = bcf_hdr_init("w"); conf->buf.l = 0; if (conf->record_cmd_line) { ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version()); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); conf->buf.l = 0; ksprintf(&conf->buf, "##bcftoolsCommand=mpileup"); for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]); kputc('\n', &conf->buf); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } if (conf->fai_fname) { conf->buf.l = 0; ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; i<hdr->n_targets; i++) { conf->buf.l = 0; ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } conf->buf.l = 0; bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); #if CDF_MWU_TESTS bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); #endif bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); if ( conf->fmt_flag&B2B_FMT_DP ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); if ( conf->fmt_flag&B2B_FMT_DV ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); if ( conf->fmt_flag&B2B_FMT_DPR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_INFO_DPR ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_FMT_DP4 ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); if ( conf->fmt_flag&B2B_FMT_SP ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); if ( conf->fmt_flag&B2B_FMT_AD ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">"); if ( conf->fmt_flag&B2B_FMT_ADF ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_FMT_ADR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">"); if ( conf->fmt_flag&B2B_INFO_AD ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">"); if ( conf->fmt_flag&B2B_INFO_ADF ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_INFO_ADR ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">"); if ( conf->gvcf ) gvcf_update_header(conf->gvcf, conf->bcf_hdr); int nsmpl; const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); for (i=0; i<nsmpl; i++) bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]); bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); conf->bca = bcf_call_init(-1., conf->min_baseQ); conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4); conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32 if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) { // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); for (i=0; i<nsmpl; i++) { conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES; conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; } } } // init mpileup conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); if ( (double)conf->max_depth * conf->nfiles > 1<<20) fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl); bam_mplp_set_maxcnt(conf->iter, conf->max_depth); conf->max_indel_depth = conf->max_indel_depth * nsmpl; conf->bcf_rec = bcf_init1(); bam_mplp_constructor(conf->iter, pileup_constructor); // Run mpileup for multiple regions if ( nregs ) { int ireg = 0; do { // first region is already positioned if ( ireg++ > 0 ) { conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); for (i=0; i<conf->nfiles; i++) { hts_itr_destroy(conf->mplp_data[i]->iter); conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } bam_mplp_reset(conf->iter); } } mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end); } while ( regitr_loop(conf->reg_itr) ); } else mpileup_reg(conf,0,0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); // clean up free(conf->bc.tmp.s); bcf_destroy1(conf->bcf_rec); if (conf->bcf_fp) { hts_close(conf->bcf_fp); bcf_hdr_destroy(conf->bcf_hdr); bcf_call_destroy(conf->bca); free(conf->bc.PL); free(conf->bc.DP4); free(conf->bc.ADR); free(conf->bc.ADF); free(conf->bc.fmt_arr); free(conf->bcr); } if ( conf->gvcf ) gvcf_destroy(conf->gvcf); free(conf->buf.s); for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]); free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp); bam_mplp_destroy(conf->iter); bam_hdr_destroy(hdr); for (i = 0; i < conf->nfiles; ++i) { if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx); sam_close(conf->mplp_data[i]->fp); if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter); free(conf->mplp_data[i]); } if ( conf->reg_itr ) regitr_destroy(conf->reg_itr); free(conf->mplp_data); free(conf->plp); free(conf->n_plp); free(mp_ref.ref[0]); free(mp_ref.ref[1]); return 0; }
static void init_data(args_t *args) { bcf1_t *line = NULL; // With phased concat, the chunks overlap and come in the right order. To // avoid opening all files at once, store start positions to recognise need // for the next one. This way we can keep only two open chunks at once. if ( args->phased_concat ) { args->start_pos = (int*) malloc(sizeof(int)*args->nfnames); line = bcf_init(); } kstring_t str = {0,0,0}; int i, prev_chrid = -1; for (i=0; i<args->nfnames; i++) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr); if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) ) error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]); int j; for (j=0; j<bcf_hdr_nsamples(hdr); j++) if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) ) error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]); if ( args->phased_concat ) { int ret = bcf_read(fp, hdr, line); if ( ret!=0 ) args->start_pos[i] = -2; // empty file else { int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line)); args->start_pos[i] = chrid==prev_chrid ? line->pos : -1; prev_chrid = chrid; } } bcf_hdr_destroy(hdr); hts_close(fp); } free(str.s); if ( line ) bcf_destroy(line); args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int)); if ( args->phased_concat ) { bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">"); bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">"); } if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); bcf_hdr_write(args->out_fh, args->out_hdr); if ( args->allow_overlaps ) { args->files = bcf_sr_init(); args->files->require_index = 1; if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->remove_dups ) { if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS; else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS; else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE; else error("The -D string \"%s\" not recognised.\n", args->remove_dups); } for (i=0; i<args->nfnames; i++) if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum)); } else if ( args->phased_concat ) { // Remove empty files from the list int nok = 0; while (1) { while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++; if ( nok==args->nfnames ) break; i = nok; while ( i<args->nfnames && args->start_pos[i]==-2 ) i++; if ( i==args->nfnames ) break; int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp; char *str = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = str; } for (i=nok; i<args->nfnames; i++) free(args->fnames[i]); args->nfnames = nok; for (i=1; i<args->nfnames; i++) if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] ) error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]); args->prev_chr = -1; args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); args->files = bcf_sr_init(); args->files->require_index = 1; args->ifname = 0; } }
//last is from abc.h void print_bcf_header(htsFile *fp,bcf_hdr_t *hdr,argStruct *args,kstring_t &buf,const bam_hdr_t *bhdr){ assert(args); ksprintf(&buf, "##angsdVersion=%s+htslib-%s\n",angsd_version(),hts_version()); bcf_hdr_append(hdr, buf.s); buf.l = 0; ksprintf(&buf, "##angsdCommand="); for (int i=1; i<args->argc; i++) ksprintf(&buf, " %s", args->argv[i]); kputc('\n', &buf); bcf_hdr_append(hdr, buf.s); buf.l=0; if(args->ref){ buf.l = 0; ksprintf(&buf, "##reference=file://%s\n", args->ref); bcf_hdr_append(hdr,buf.s); } if (args->anc){ buf.l = 0; ksprintf(&buf, "##ancestral=file://%s\n", args->anc); bcf_hdr_append(hdr,buf.s); } // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (int i=0; i<bhdr->n_targets; i++){ buf.l = 0; ksprintf(&buf, "##contig=<ID=%s,length=%d>", bhdr->target_name[i], bhdr->target_len[i]); bcf_hdr_append(hdr, buf.s); } buf.l = 0; bcf_hdr_append(hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">"); bcf_hdr_append(hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">"); bcf_hdr_append(hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"); bcf_hdr_append(hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(hdr,"##INFO=<ID=AF,Number=A,Type=Float,Description=\"Minor Allele Frequency\">"); bcf_hdr_append(hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); bcf_hdr_append(hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">"); bcf_hdr_append(hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">"); bcf_hdr_append(hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=GL,Number=G,Type=Float,Description=\"scaled Genotype Likelihoods (loglikeratios to the most likely (in log10))\">"); //fprintf(stderr,"samples from sm:%d\n",args->sm->n); for(int i=0;i<args->sm->n;i++){ bcf_hdr_add_sample(hdr, args->sm->smpl[i]); } bcf_hdr_add_sample(hdr, NULL); // to update internal structures if ( bcf_hdr_write(fp, hdr)!=0 ) fprintf(stderr,"Failed to write bcf\n"); buf.l=0; }
int beds_database_add(struct beds_options *opts, const char *fname, char *columns) { if ( opts->n_files == opts->m_files ) { opts->m_files = opts->m_files == 0 ? 2 : opts->m_files +2; opts->files = (struct beds_anno_file*)realloc(opts->files, opts->m_files*sizeof(struct beds_anno_file)); } struct beds_anno_file *file = &opts->files[opts->n_files]; memset(file, 0, sizeof(struct beds_anno_file)); file->id = opts->n_files; file->fname = strdup(fname); file->fp = hts_open(fname, "r"); if (file->fp == NULL) error("Failed to open %s : %s", fname, strerror(errno)); // int n; file->idx = tbx_index_load(fname); if ( file->idx == NULL) error("Failed to load index of %s.", fname); opts->n_files++; file->last_id = -1; file->last_start = -1; file->last_end = -1; kstring_t string = KSTRING_INIT; int no_columns = 0; int i; if ( columns == NULL && file->no_such_chrom == 0) { warnings("No columns string specified for %s. Will annotate all tags in this data.", fname); file->no_such_chrom = 1; no_columns = 1; } else { int *splits = NULL; kputs(columns, &string); int nfields; splits = ksplit(&string, ',', &nfields); file->m_cols = nfields; file->cols = (struct anno_col*)malloc(sizeof(struct anno_col) * file->m_cols); for ( i = 0; i < nfields; ++i ) { char *ss = string.s + splits[i]; struct anno_col *col = &file->cols[file->n_cols]; col->icol = i; col->replace = REPLACE_MISSING; if (*ss == '+') { col->replace = REPLACE_MISSING; ss++; } else if ( *ss == '-' ) { col->replace = REPLACE_EXISTING; ss++; } if (ss[0] == '\0') continue; if ( strncmp(ss, "INFO/", 5) == 0) ss += 5; col->hdr_key = strdup(ss); col->icol = -1; // debug_print("%s, %d", col->hdr_key, file->n_cols); file->n_cols++; } string.l = 0; } while (1) { string.l =0; if ( hts_getline(file->fp, KS_SEP_LINE, &string) < 0 ) break; // only accept header line in the beginning for file if ( string.s[0] != '#' ) break; if ( strncmp(string.s, "##INFO=", 7) == 0) { char *ss = string.s + 11; char *se = ss; while (se && *se != ',') se++; struct anno_col *col = NULL; // if no column string specified, init all header lines if ( no_columns ) { if ( file->n_cols == file->m_cols ) { file->m_cols = file->m_cols == 0 ? 2 : file->m_cols + 2; file->cols = (struct anno_col *) realloc(file->cols, file->m_cols*sizeof(struct anno_col)); } col = &file->cols[file->n_cols++]; col->icol = -1; col->hdr_key = strndup(ss, se-ss+1); col->hdr_key[se-ss] = '\0'; } else { for ( i = 0; i < file->n_cols; ++i ) { if ( strncmp(file->cols[i].hdr_key, ss, se-ss) == 0) break; } // if header line is not set in the column string, skip if ( i == file->n_cols ) continue; col = &file->cols[i]; } // specify setter functions here col->setter.bed = beds_setter_info_string; bcf_hdr_append(opts->hdr_out, string.s); bcf_hdr_sync(opts->hdr_out); int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID,col->hdr_key); assert ( bcf_hdr_idinfo_exists(opts->hdr_out, BCF_HL_INFO, hdr_id) ); } string.l = 0; // set column number for each col if ( strncasecmp(string.s, "#chr", 4) == 0) { int nfields; int *splits = ksplit(&string, '\t', &nfields); if (nfields < 4) { fprintf(stderr, "[error] Bad header of bed database : %s. n_fields : %d, %s", fname, nfields, string.s); fprintf(stderr, "[notice] this error usually happened because the header line is seperated by spaces but not tab!"); exit(1); } int k; for ( k = 3; k < nfields; ++k ) { char *ss = string.s + splits[k]; for (i = 0; i < file->n_cols; ++i ) { struct anno_col *col = &file->cols[i]; if ( strcmp(col->hdr_key, ss) == 0) break; } // if name line specify more names than column string or header, skip if ( i == file->n_cols ) continue; struct anno_col *col = &file->cols[i]; col->icol = k; } } } for ( i = 0; i < file->n_cols; ++i ) { struct anno_col *col = &file->cols[i]; if ( col->hdr_key && col->icol == -1 ) error("No column %s found in bed database : %s", col->hdr_key, fname); int hdr_id = bcf_hdr_id2int(opts->hdr_out, BCF_DT_ID, col->hdr_key); assert(hdr_id>-1); col->number = bcf_hdr_id2length(opts->hdr_out, BCF_HL_INFO, hdr_id); if ( col->number == BCF_VL_A || col->number == BCF_VL_R || col->number == BCF_VL_G) error("Only support fixed INFO number for bed database. %s", col->hdr_key); col->ifile = file->id; } if ( string.m ) free(string.s); if ( opts->beds_is_inited == 0 ) opts->beds_is_inited = 1; return 0; }
static void init_data(args_t *args) { args->aux.srs = bcf_sr_init(); // Open files for input and output, initialize structures if ( args->targets ) { if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 ) error("Failed to read the targets: %s\n", args->targets); if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED ) { args->aux.srs->targets->missed_reg_handler = print_missed_line; args->aux.srs->targets->missed_reg_data = args; } } if ( args->regions ) { if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) error("Failed to read the targets: %s\n", args->regions); } int i; if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open: %s\n", args->bcf_fname); if ( args->nsamples && args->nsamples != bcf_hdr_nsamples(args->aux.srs->readers[0].header) ) { args->samples_map = (int *) malloc(sizeof(int)*args->nsamples); args->aux.hdr = bcf_hdr_subset(args->aux.srs->readers[0].header, args->nsamples, args->samples, args->samples_map); for (i=0; i<args->nsamples; i++) if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]); if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n"); } else { args->aux.hdr = bcf_hdr_dup(args->aux.srs->readers[0].header); for (i=0; i<args->nsamples; i++) if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 ) error("No such sample: %s\n", args->samples[i]); } // Reorder ploidy and family indexes to match mpileup's output and exclude samples which are not available if ( args->aux.ploidy ) { for (i=0; i<args->aux.nfams; i++) { int j; for (j=0; j<3; j++) { int k = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[ args->aux.fams[i].sample[j] ]); if ( k<0 ) error("No such sample: %s\n", args->samples[ args->aux.fams[i].sample[j] ]); args->aux.fams[i].sample[j] = k; } } uint8_t *ploidy = (uint8_t*) calloc(bcf_hdr_nsamples(args->aux.hdr), 1); for (i=0; i<args->nsamples; i++) // i index in -s sample list { int j = bcf_hdr_id2int(args->aux.hdr, BCF_DT_SAMPLE, args->samples[i]); // j index in the output VCF / subset VCF if ( j<0 ) { fprintf(stderr,"Warning: no such sample: \"%s\"\n", args->samples[i]); continue; } ploidy[j] = args->aux.ploidy[i]; } args->nsamples = bcf_hdr_nsamples(args->aux.hdr); for (i=0; i<args->nsamples; i++) assert( ploidy[i]==0 || ploidy[i]==1 || ploidy[i]==2 ); free(args->aux.ploidy); args->aux.ploidy = ploidy; } args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->flag & CF_QCALL ) return; if ( args->flag & CF_MCALL ) mcall_init(&args->aux); if ( args->flag & CF_CCALL ) ccall_init(&args->aux); if ( args->flag&CF_GVCF ) { bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">"); args->gvcf.rid = -1; args->gvcf.line = bcf_init1(); args->gvcf.gt = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr)); for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) { args->gvcf.gt[2*i+0] = bcf_gt_unphased(0); args->gvcf.gt[2*i+1] = bcf_gt_unphased(0); } } bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS"); bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); bcf_hdr_write(args->out_fh, args->aux.hdr); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); }
static void init_data(args_t *args) { args->aux.srs = bcf_sr_init(); // Open files for input and output, initialize structures if ( args->targets ) { if ( bcf_sr_set_targets(args->aux.srs, args->targets, args->targets_is_file, args->aux.flag&CALL_CONSTR_ALLELES ? 3 : 0)<0 ) error("Failed to read the targets: %s\n", args->targets); if ( args->aux.flag&CALL_CONSTR_ALLELES && args->flag&CF_INS_MISSED ) { args->aux.srs->targets->missed_reg_handler = print_missed_line; args->aux.srs->targets->missed_reg_data = args; } } if ( args->regions ) { if ( bcf_sr_set_regions(args->aux.srs, args->regions, args->regions_is_file)<0 ) error("Failed to read the targets: %s\n", args->regions); } if ( !bcf_sr_add_reader(args->aux.srs, args->bcf_fname) ) error("Failed to open %s: %s\n", args->bcf_fname,bcf_sr_strerror(args->aux.srs->errnum)); args->aux.hdr = bcf_sr_get_header(args->aux.srs,0); int i; if ( args->samples_fname ) { set_samples(args, args->samples_fname, args->samples_is_file); if ( args->aux.flag&CALL_CONSTR_TRIO ) { if ( 3*args->aux.nfams!=args->nsamples ) error("Expected only trios in %s, sorry!\n", args->samples_fname); fprintf(stderr,"Detected %d samples in %d trio families\n", args->nsamples,args->aux.nfams); } args->nsex = ploidy_nsex(args->ploidy); args->sex2ploidy = (int*) calloc(args->nsex,sizeof(int)); args->sex2ploidy_prev = (int*) calloc(args->nsex,sizeof(int)); args->aux.ploidy = (uint8_t*) malloc(args->nsamples); for (i=0; i<args->nsamples; i++) args->aux.ploidy[i] = 2; for (i=0; i<args->nsex; i++) args->sex2ploidy_prev[i] = 2; } if ( args->samples_map ) { args->aux.hdr = bcf_hdr_subset(bcf_sr_get_header(args->aux.srs,0), args->nsamples, args->samples, args->samples_map); if ( !args->aux.hdr ) error("Error occurred while subsetting samples\n"); for (i=0; i<args->nsamples; i++) if ( args->samples_map[i]<0 ) error("No such sample: %s\n", args->samples[i]); if ( !bcf_hdr_nsamples(args->aux.hdr) ) error("No matching sample found\n"); } else { args->aux.hdr = bcf_hdr_dup(bcf_sr_get_header(args->aux.srs,0)); for (i=0; i<args->nsamples; i++) if ( bcf_hdr_id2int(args->aux.hdr,BCF_DT_SAMPLE,args->samples[i])<0 ) error("No such sample: %s\n", args->samples[i]); } args->out_fh = hts_open(args->output_fname, hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->flag & CF_QCALL ) return; if ( args->flag & CF_MCALL ) mcall_init(&args->aux); if ( args->flag & CF_CCALL ) ccall_init(&args->aux); if ( args->flag&CF_GVCF ) { bcf_hdr_append(args->aux.hdr,"##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">"); args->gvcf.rid = -1; args->gvcf.line = bcf_init1(); args->gvcf.gt = (int32_t*) malloc(2*sizeof(int32_t)*bcf_hdr_nsamples(args->aux.hdr)); for (i=0; i<bcf_hdr_nsamples(args->aux.hdr); i++) { args->gvcf.gt[2*i+0] = bcf_gt_unphased(0); args->gvcf.gt[2*i+1] = bcf_gt_unphased(0); } } bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "QS"); bcf_hdr_remove(args->aux.hdr, BCF_HL_INFO, "I16"); bcf_hdr_append_version(args->aux.hdr, args->argc, args->argv, "bcftools_call"); bcf_hdr_write(args->out_fh, args->aux.hdr); if ( args->flag&CF_INS_MISSED ) init_missed_line(args); }