static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst) { bcf_hrec_t *src_hrec, *dst_hrec, *tmp; bcf_hdr_t *out = bcf_hdr_init("r"); int i; for (i=0; i<dst->nhrec; i++) { // first insert lines which do not code BCF ids, their order does not matter dst_hrec = dst->hrec[i]; if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG ) continue; bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec)); } for (i=0; i<src->nhrec; i++) { // now transfer header lines which define BCF ids src_hrec = src->hrec[i]; if ( src_hrec->type==BCF_HL_FLT || src_hrec->type==BCF_HL_INFO || src_hrec->type==BCF_HL_FMT || src_hrec->type== BCF_HL_CTG ) { int j = bcf_hrec_find_key(src_hrec, "ID"); dst_hrec = bcf_hdr_get_hrec(dst, src_hrec->type, "ID", src_hrec->vals[j], NULL); if ( !dst_hrec ) continue; tmp = bcf_hrec_dup(dst_hrec); j = bcf_hrec_find_key(src_hrec, "IDX"); if ( j>=0 ) { j = atoi(src_hrec->vals[j]); hrec_add_idx(tmp, j); } bcf_hdr_add_hrec(out, tmp); } } bcf_hdr_sync(out); for (i=0; i<dst->nhrec; i++) { // finally add new structured fields dst_hrec = dst->hrec[i]; if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type== BCF_HL_CTG ) { int j = bcf_hrec_find_key(dst_hrec, "ID"); tmp = bcf_hdr_get_hrec(out, dst_hrec->type, "ID", dst_hrec->vals[j], NULL); if ( !tmp ) bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec)); } } for (i=0; i<dst->n[BCF_DT_SAMPLE]; i++) bcf_hdr_add_sample(out, dst->samples[i]); bcf_hdr_add_sample(out, NULL); bcf_hdr_destroy(dst); return out; }
void bcf_hdr_merge(bcf_hdr_t *hw, const bcf_hdr_t *_hr, const char *clash_prefix) { bcf_hdr_t *hr = (bcf_hdr_t*)_hr; // header lines int i, nw_ori = hw->nhrec; for (i=0; i<hr->nhrec; i++) { if ( hr->hrec[i]->type==BCF_HL_GEN && hr->hrec[i]->value ) { int j; for (j=0; j<nw_ori; j++) { if ( hw->hrec[j]->type!=BCF_HL_GEN ) continue; if ( !strcmp(hr->hrec[i]->key,hw->hrec[j]->key) && !strcmp(hr->hrec[i]->value,hw->hrec[j]->value) ) break; } if ( j>=nw_ori ) bcf_hdr_add_hrec(hw, bcf_hrec_dup(hr->hrec[i])); } else { bcf_hrec_t *rec = bcf_hdr_get_hrec(hw, hr->hrec[i]->type, hr->hrec[i]->vals[0]); if ( !rec ) bcf_hdr_add_hrec(hw, bcf_hrec_dup(hr->hrec[i])); } } // samples for (i=0; i<bcf_hdr_nsamples(hr); i++) { char *name = hr->samples[i]; if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 ) { // there is a sample with the same name int len = strlen(hr->samples[i]) + strlen(clash_prefix) + 1; name = (char*) malloc(sizeof(char)*(len+1)); sprintf(name,"%s:%s",clash_prefix,hr->samples[i]); bcf_hdr_add_sample(hw,name); free(name); } else bcf_hdr_add_sample(hw,name); } }
void bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) { int i = 0; const char *p, *q; vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; // add samples d = (vdict_t*)h->dict[BCF_DT_SAMPLE]; for (p = q = str;; ++q) { if (*q != '\t' && *q != 0) continue; if (++i > 9) { char *s; s = (char*)malloc(q - p + 1); strncpy(s, p, q - p); s[q - p] = 0; bcf_hdr_add_sample(h,s); } if (*q == 0) break; p = q + 1; } }
static int mpileup(mplp_conf_t *conf) { if (conf->nfiles == 0) { fprintf(stderr,"[%s] no input file/data given\n", __func__); exit(EXIT_FAILURE); } mplp_ref_t mp_ref = MPLP_REF_INIT; conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t)); conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*)); conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*)); conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int)); // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index // must be kept in the memory for the whole time which can be a problem with many bams. // Therefore if none or only one region is requested, we initialize the bam iterator as // before and free the index. Only when multiple regions are queried, we keep the index. int nregs = 0; if ( conf->reg_fname ) { if ( conf->reg_is_file ) { conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL); if ( !conf->reg ) { fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } else { conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) { fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } nregs = regidx_nregs(conf->reg); conf->reg_itr = regitr_init(conf->reg); regitr_loop(conf->reg_itr); // region iterator now positioned at the first region } // read the header of each file in the list and initialize data // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least! bam_hdr_t *hdr = NULL; // header of first file in input list int i; for (i = 0; i < conf->nfiles; ++i) { bam_hdr_t *h_tmp; conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t)); conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb"); if ( !conf->mplp_data[i]->fp ) { fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); exit(EXIT_FAILURE); } if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) { fprintf(stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); exit(EXIT_FAILURE); } conf->mplp_data[i]->conf = conf; conf->mplp_data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(conf->mplp_data[i]->fp); if ( !h_tmp ) { fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]); if ( conf->mplp_data[i]->bam_id<0 ) { // no usable readgroups in this bam, it can be skipped sam_close(conf->mplp_data[i]->fp); free(conf->mplp_data[i]); bam_hdr_destroy(h_tmp); free(conf->files[i]); if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1)); conf->nfiles--; i--; continue; } if (conf->reg) { hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); if (idx == NULL) { fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s); if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } if ( nregs==1 ) // no need to keep the index in memory hts_idx_destroy(idx); else conf->mplp_data[i]->idx = idx; } if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */ else { // FIXME: check consistency between h and h_tmp bam_hdr_destroy(h_tmp); // we store only the first file's header; it's (alleged to be) // compatible with the i-th file's target_name lookup needs conf->mplp_data[i]->h = hdr; } } // allocate data storage proportionate to number of samples being studied sm->n bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n); conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); if (conf->bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads); // BCF header creation conf->bcf_hdr = bcf_hdr_init("w"); conf->buf.l = 0; if (conf->record_cmd_line) { ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version()); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); conf->buf.l = 0; ksprintf(&conf->buf, "##bcftoolsCommand=mpileup"); for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]); kputc('\n', &conf->buf); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } if (conf->fai_fname) { conf->buf.l = 0; ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; i<hdr->n_targets; i++) { conf->buf.l = 0; ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } conf->buf.l = 0; bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); #if CDF_MWU_TESTS bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); #endif bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); if ( conf->fmt_flag&B2B_FMT_DP ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); if ( conf->fmt_flag&B2B_FMT_DV ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); if ( conf->fmt_flag&B2B_FMT_DPR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_INFO_DPR ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_FMT_DP4 ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); if ( conf->fmt_flag&B2B_FMT_SP ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); if ( conf->fmt_flag&B2B_FMT_AD ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">"); if ( conf->fmt_flag&B2B_FMT_ADF ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_FMT_ADR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">"); if ( conf->fmt_flag&B2B_INFO_AD ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">"); if ( conf->fmt_flag&B2B_INFO_ADF ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_INFO_ADR ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">"); if ( conf->gvcf ) gvcf_update_header(conf->gvcf, conf->bcf_hdr); int nsmpl; const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); for (i=0; i<nsmpl; i++) bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]); bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); conf->bca = bcf_call_init(-1., conf->min_baseQ); conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4); conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32 if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) { // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); for (i=0; i<nsmpl; i++) { conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES; conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; } } } // init mpileup conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); if ( (double)conf->max_depth * conf->nfiles > 1<<20) fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl); bam_mplp_set_maxcnt(conf->iter, conf->max_depth); conf->max_indel_depth = conf->max_indel_depth * nsmpl; conf->bcf_rec = bcf_init1(); bam_mplp_constructor(conf->iter, pileup_constructor); // Run mpileup for multiple regions if ( nregs ) { int ireg = 0; do { // first region is already positioned if ( ireg++ > 0 ) { conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); for (i=0; i<conf->nfiles; i++) { hts_itr_destroy(conf->mplp_data[i]->iter); conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } bam_mplp_reset(conf->iter); } } mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end); } while ( regitr_loop(conf->reg_itr) ); } else mpileup_reg(conf,0,0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); // clean up free(conf->bc.tmp.s); bcf_destroy1(conf->bcf_rec); if (conf->bcf_fp) { hts_close(conf->bcf_fp); bcf_hdr_destroy(conf->bcf_hdr); bcf_call_destroy(conf->bca); free(conf->bc.PL); free(conf->bc.DP4); free(conf->bc.ADR); free(conf->bc.ADF); free(conf->bc.fmt_arr); free(conf->bcr); } if ( conf->gvcf ) gvcf_destroy(conf->gvcf); free(conf->buf.s); for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]); free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp); bam_mplp_destroy(conf->iter); bam_hdr_destroy(hdr); for (i = 0; i < conf->nfiles; ++i) { if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx); sam_close(conf->mplp_data[i]->fp); if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter); free(conf->mplp_data[i]); } if ( conf->reg_itr ) regitr_destroy(conf->reg_itr); free(conf->mplp_data); free(conf->plp); free(conf->n_plp); free(mp_ref.ref[0]); free(mp_ref.ref[1]); return 0; }
/* * Performs pileup * @param conf configuration for this pileup * @param n number of files specified in fn * @param fn filenames */ static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_hdr_t *h = NULL; /* header of first file in input list */ char *ref; void *rghash = NULL; FILE *pileup_fp = NULL; bcf_callaux_t *bca = NULL; bcf_callret1_t *bcr = NULL; bcf_call_t bc; htsFile *bcf_fp = NULL; bcf_hdr_t *bcf_hdr = NULL; bam_sample_t *sm = NULL; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(mplp_aux_t*)); plp = calloc(n, sizeof(bam_pileup1_t*)); n_plp = calloc(n, sizeof(int)); sm = bam_smpl_init(); if (n == 0) { fprintf(stderr,"[%s] no input file/data given\n", __func__); exit(1); } // read the header of each file in the list and initialize data for (i = 0; i < n; ++i) { bam_hdr_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = sam_open(fn[i], "rb"); if ( !data[i]->fp ) { fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno)); exit(1); } hts_set_fai_filename(data[i]->fp, conf->fai_fname); data[i]->conf = conf; h_tmp = sam_hdr_read(data[i]->fp); if ( !h_tmp ) { fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]); exit(1); } data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search) rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]); exit(1); } if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg); exit(1); } if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end; hts_idx_destroy(idx); } if (i == 0) h = h_tmp; /* save the header of first file in list */ else { // FIXME: to check consistency bam_hdr_destroy(h_tmp); } } // allocate data storage proportionate to number of samples being studied sm->n gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_BCF) { const char *mode; if ( conf->flag & MPLP_VCF ) mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz"; // uncompressed VCF or compressed VCF else mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb"; // uncompressed BCF or compressed BCF bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode); if (bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(1); } bcf_hdr = bcf_hdr_init("w"); kstring_t str = {0,0,0}; ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version()); bcf_hdr_append(bcf_hdr, str.s); str.l = 0; ksprintf(&str, "##samtoolsCommand=samtools mpileup"); for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]); kputc('\n', &str); bcf_hdr_append(bcf_hdr, str.s); if (conf->fai_fname) { str.l = 0; ksprintf(&str, "##reference=file://%s\n", conf->fai_fname); bcf_hdr_append(bcf_hdr, str.s); } // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; i<h->n_targets; i++) { str.l = 0; ksprintf(&str, "##contig=<ID=%s,length=%d>", h->target_name[i], h->target_len[i]); bcf_hdr_append(bcf_hdr, str.s); } free(str.s); bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); #if CDF_MWU_TESTS bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); #endif bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); if ( conf->fmt_flag&B2B_FMT_DP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); if ( conf->fmt_flag&B2B_FMT_DV ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); if ( conf->fmt_flag&B2B_FMT_DPR ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_INFO_DPR ) bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_FMT_DP4 ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); if ( conf->fmt_flag&B2B_FMT_SP ) bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); for (i=0; i<sm->n; i++) bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]); bcf_hdr_add_sample(bcf_hdr, NULL); bcf_hdr_write(bcf_fp, bcf_hdr); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; bc.bcf_hdr = bcf_hdr; bc.n = sm->n; bc.PL = malloc(15 * sm->n * sizeof(*bc.PL)); if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4); bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) ) { // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t)); for (i=0; i<sm->n; i++) bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES; } } } else { pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout; if (pileup_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno)); exit(1); } } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; // begin pileup iter = bam_mplp_init(n, mplp_func, (void**)data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); bcf1_t *bcf_rec = bcf_init1(); int ret; while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_BCF) { int total_depth, _ref0, ref16; for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = seq_nt16_table[_ref0]; bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bc.tid = tid; bc.pos = pos; bcf_call_combine(gplp.n, bcr, bca, ref16, &bc); bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring? if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { bcf_callaux_clean(bca, &bc); for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) { bcf_clear1(bcf_rec); bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref); bcf_write1(bcf_fp, bcf_hdr, bcf_rec); } } } else { fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j, cnt; for (j = cnt = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt; } fprintf(pileup_fp, "\t%d\t", cnt); if (n_plp[i] == 0) { fputs("*\t*", pileup_fp); if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp); if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp); } else { for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref); } putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if (c >= conf->min_baseQ) { c = c + 33 < 126? c + 33 : 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_MAPQ) { putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam_get_qual(p->b)[p->qpos]; if ( c < conf->min_baseQ ) continue; c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putc(c, pileup_fp); } } if (conf->flag & MPLP_PRINT_POS) { putc('\t', pileup_fp); for (j = 0; j < n_plp[i]; ++j) { if (j > 0) putc(',', pileup_fp); fprintf(pileup_fp, "%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... } } } } putc('\n', pileup_fp); } } // clean up free(bc.tmp.s); bcf_destroy1(bcf_rec); if (bcf_fp) { hts_close(bcf_fp); bcf_hdr_destroy(bcf_hdr); bcf_call_destroy(bca); free(bc.PL); free(bc.DP4); free(bc.DPR); free(bc.fmt_arr); free(bcr); } if (pileup_fp && conf->output_fname) fclose(pileup_fp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bam_mplp_destroy(iter); bam_hdr_destroy(h); for (i = 0; i < n; ++i) { sam_close(data[i]->fp); if (data[i]->iter) hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return ret; }
//last is from abc.h void print_bcf_header(htsFile *fp,bcf_hdr_t *hdr,argStruct *args,kstring_t &buf,const bam_hdr_t *bhdr){ assert(args); ksprintf(&buf, "##angsdVersion=%s+htslib-%s\n",angsd_version(),hts_version()); bcf_hdr_append(hdr, buf.s); buf.l = 0; ksprintf(&buf, "##angsdCommand="); for (int i=1; i<args->argc; i++) ksprintf(&buf, " %s", args->argv[i]); kputc('\n', &buf); bcf_hdr_append(hdr, buf.s); buf.l=0; if(args->ref){ buf.l = 0; ksprintf(&buf, "##reference=file://%s\n", args->ref); bcf_hdr_append(hdr,buf.s); } if (args->anc){ buf.l = 0; ksprintf(&buf, "##ancestral=file://%s\n", args->anc); bcf_hdr_append(hdr,buf.s); } // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (int i=0; i<bhdr->n_targets; i++){ buf.l = 0; ksprintf(&buf, "##contig=<ID=%s,length=%d>", bhdr->target_name[i], bhdr->target_len[i]); bcf_hdr_append(hdr, buf.s); } buf.l = 0; bcf_hdr_append(hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of raw reads supporting an indel\">"); bcf_hdr_append(hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of raw reads supporting an indel\">"); bcf_hdr_append(hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=NS,Number=1,Type=Integer,Description=\"Number of Samples With Data\">"); bcf_hdr_append(hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(hdr,"##INFO=<ID=AF,Number=A,Type=Float,Description=\"Minor Allele Frequency\">"); bcf_hdr_append(hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); bcf_hdr_append(hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">"); bcf_hdr_append(hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">"); bcf_hdr_append(hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); bcf_hdr_append(hdr,"##FORMAT=<ID=GL,Number=G,Type=Float,Description=\"scaled Genotype Likelihoods (loglikeratios to the most likely (in log10))\">"); //fprintf(stderr,"samples from sm:%d\n",args->sm->n); for(int i=0;i<args->sm->n;i++){ bcf_hdr_add_sample(hdr, args->sm->smpl[i]); } bcf_hdr_add_sample(hdr, NULL); // to update internal structures if ( bcf_hdr_write(fp, hdr)!=0 ) fprintf(stderr,"Failed to write bcf\n"); buf.l=0; }
int main(int argc, char **argv) { char *fname = argc>1 ? argv[1] : "/dev/null"; htsFile *fp = hts_open(fname, "w"); bcf_hdr_t *hdr1, *hdr2; hdr1 = bcf_hdr_init("w"); hdr2 = bcf_hdr_init("w"); // Add two shared and two private annotations bcf_hdr_append(hdr1, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT1,Description=\"Filter 1\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT4,Description=\"Filter 4\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF1,Number=.,Type=Integer,Description=\"Info 1\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF4,Number=.,Type=Integer,Description=\"Info 4\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT1,Number=.,Type=Integer,Description=\"FMT 1\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT4,Number=.,Type=Integer,Description=\"FMT 4\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_add_sample(hdr1,"SMPL1"); bcf_hdr_add_sample(hdr1,"SMPL2"); bcf_hdr_add_sample(hdr2,"SMPL1"); bcf_hdr_add_sample(hdr2,"SMPL2"); bcf_hdr_sync(hdr1); bcf_hdr_sync(hdr2); hdr2 = bcf_hdr_merge(hdr2,hdr1); bcf_hdr_sync(hdr2); if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname); bcf1_t *rec = bcf_init1(); rec->rid = bcf_hdr_name2id(hdr1, "1"); rec->pos = 0; bcf_update_alleles_str(hdr1, rec, "G,A"); int32_t tmpi[3]; tmpi[0] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT1"); tmpi[1] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"); tmpi[2] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT3"); bcf_update_filter(hdr1, rec, tmpi, 3); tmpi[0] = 1; bcf_update_info_int32(hdr1, rec, "INF1", tmpi, 1); tmpi[0] = 2; bcf_update_info_int32(hdr1, rec, "INF2", tmpi, 1); tmpi[0] = 3; bcf_update_info_int32(hdr1, rec, "INF3", tmpi, 1); tmpi[0] = tmpi[1] = 1; bcf_update_format_int32(hdr1, rec, "FMT1", tmpi, 2); tmpi[0] = tmpi[1] = 2; bcf_update_format_int32(hdr1, rec, "FMT2", tmpi, 2); tmpi[0] = tmpi[1] = 3; bcf_update_format_int32(hdr1, rec, "FMT3", tmpi, 2); bcf_remove_filter(hdr1, rec, bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"), 0); bcf_update_info_int32(hdr1, rec, "INF2", NULL, 0); bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0); bcf_translate(hdr2, hdr1, rec); if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname); // Clean bcf_destroy1(rec); bcf_hdr_destroy(hdr1); bcf_hdr_destroy(hdr2); int ret; if ( (ret=hts_close(fp)) ) { fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); exit(ret); } return 0; }
int ctx_vcfcov(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL, *out_type = NULL; uint32_t max_allele_len = 0, max_gt_vars = 0; char *ref_path = NULL; bool low_mem = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; size_t i; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break; case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break; case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break; case 'M': cmd_check(!low_mem, cmd); low_mem = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)"); if(optind+2 > argc) cmd_print_usage("Require VCF and graph files"); if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; status("[vcfcov] max allele length: %u; max number of variants: %u", max_allele_len, max_gt_vars); // open ref // index fasta with: samtools faidx ref.fa faidx_t *fai = fai_load(ref_path); if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path); // Open input VCF file const char *vcf_path = argv[optind++]; htsFile *vcffh = hts_open(vcf_path, "r"); if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path); bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh); if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path); // Test we can close and reopen files if(low_mem) { if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); } // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, low_mem ? -1 : (int64_t)ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *outfh = hts_open(out_path, modes_htslib[mode]); status("[vcfcov] Output format: %s", hsmodes_htslib[mode]); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_COVGS); // // Set up tag names // // *R => ref, *A => alt sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size); // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK> // - K29_kcov is empirical kmer coverage // - K29_nkmers is the number of kmers in the sample // - mean_read_length is the mean read length in bases char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20]; sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size); sprintf(sample_rlk_tag, "mean_read_length"); // // Load kmers if we are using --low-mem // VcfCovStats st; memset(&st, 0, sizeof(st)); VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag, .kcov_alt_tag = kcov_alt_tag, .max_allele_len = max_allele_len, .max_gt_vars = max_gt_vars, .load_kmers_only = false}; if(low_mem) { status("[vcfcov] Loading kmers from VCF+ref"); prefs.load_kmers_only = true; vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai, NULL, &prefs, &st, &db_graph); // Close files hts_close(vcffh); bcf_hdr_destroy(vcfhdr); // Re-open files if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); prefs.load_kmers_only = false; } // // Load graphs // GraphLoadingStats gstats; memset(&gstats, 0, sizeof(gstats)); GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.must_exist_in_graph = low_mem; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // // Set up VCF header / graph matchup // size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t)); // Add samples to vcf header bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr); bcf_hrec_t *hrec; int sid; char hdrstr[200]; for(i = 0; i < db_graph.num_of_cols; i++) { char *sname = db_graph.ginfo[i].sample_name.b; if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) { bcf_hdr_add_sample(outhdr, sname); sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname); } samplehdrids[i] = sid; // Add SAMPLE field hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE"); if(hrec == NULL) { sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname, sample_kcov_tag, gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0, sample_nk_tag, gstats.nkmers[i], sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length); bcf_hdr_append(outhdr, hdrstr); } else { // mean kcovg sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr); // num kmers sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr); // mean read length in kmers sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length); vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr); } status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]); } // Add genotype format fields // One field per alternative allele sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_ref_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_alt_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); bcf_hdr_set_version(outhdr, "VCFv4.2"); // Add command string to header vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd()); if(bcf_hdr_write(outfh, outhdr) != 0) die("Cannot write header to: %s", futil_outpath_str(out_path)); status("[vcfcov] Reading %s and adding coverage", vcf_path); // Reset stats and get coverage memset(&st, 0, sizeof(st)); vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai, samplehdrids, &prefs, &st, &db_graph); // Print statistics char ns0[50], ns1[50]; status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0)); status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0)); status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0)); status("[vcfcov] ALTs used: %s / %s (%.2f%%)", ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0); status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len, ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0); status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)", max_gt_vars, db_graph.kmer_size, ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0); status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)", ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0); status("[vcfcov] Saved to: %s\n", out_path); ctx_free(samplehdrids); graph_loading_stats_destroy(&gstats); bcf_hdr_destroy(vcfhdr); bcf_hdr_destroy(outhdr); hts_close(vcffh); hts_close(outfh); fai_destroy(fai); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }