/*
 * bcf_sr_destroy() - release a synced-reader collection and what it owns.
 *
 * @files: collection to destroy; the pointer itself is freed and must not be
 *         used afterwards.
 *
 * For every reader the index (tabix and/or BCF), header, file handle,
 * iterator, record buffer and sample list are released; then the shared
 * arrays, the optional targets sub-structure, the scratch string and finally
 * the container itself.
 */
void bcf_sr_destroy(readers_t *files)
{
    if ( !files->nreaders )
    {
        // The container is owned by this function on every other path, so it
        // must be released here as well; previously it was leaked.
        // NOTE(review): the remaining members are left untouched on this path
        // because their initial state with zero readers is not visible here.
        free(files);
        return;
    }

    int i;
    for (i=0; i<files->nreaders; i++)
    {
        reader_t *reader = &files->readers[i];
        if ( reader->tbx ) tbx_destroy(reader->tbx);
        if ( reader->bcf ) hts_idx_destroy(reader->bcf);
        bcf_hdr_destroy(reader->header);
        hts_close(reader->file);
        if ( reader->itr ) tbx_itr_destroy(reader->itr);

        int j;
        for (j=0; j<reader->mbuffer; j++)
            bcf_destroy1(reader->buffer[j]);
        free(reader->buffer);
        free(reader->samples);      // free(NULL) is a no-op
    }
    free(files->readers);
    free(files->seqs);

    for (i=0; i<files->n_smpl; i++) free(files->samples[i]);
    free(files->samples);

    if (files->targets)
    {
        if (files->targets->itr) tbx_itr_destroy(files->targets->itr);
        tbx_destroy(files->targets->tbx);
        if (files->targets->line.m) free(files->targets->line.s);
        hts_close(files->targets->file);
        free(files->targets->seq_names);
        free(files->targets);
    }
    if ( files->tmps.m ) free(files->tmps.s);
    free(files);
}
int main_vcfview(int argc, char *argv[]) { int c, clevel = -1, in_type = FT_BCF, out_type = FT_VCF; char *fname_out = NULL, moder[8], modew[8]; while ((c = getopt(argc, argv, "l:bvo:n:z?hu")) >= 0) { switch (c) { case 'o': switch (optarg[0]) { case 'b': out_type = FT_BCF_GZ; break; case 'u': out_type = FT_BCF; break; case 'z': out_type = FT_VCF_GZ; break; case 'v': out_type = FT_VCF; break; default: error("The output type \"%s\" not recognised\n", optarg); } break; case 'l': clevel = atoi(optarg); out_type |= FT_GZ; break; case 'v': in_type = FT_VCF; break; case 'b': out_type = FT_BCF_GZ; break; case 'u': out_type = FT_BCF; break; case 'z': out_type = FT_VCF_GZ; break; case 'n': fname_out = optarg; break; case '?': case 'h': usage(); return 1; break; } } if (argc!=optind+1) { usage(); return 1; } // Init reader strcpy(moder, "r"); if ( (!strcmp("-",argv[optind]) && (in_type & FT_BCF)) || (hts_file_type(argv[optind]) & FT_BCF)) strcat(moder, "b"); htsFile *fp_in = hts_open(argv[optind], moder, NULL); if ( !fp_in ) error("Fail to open: %s\n", argv[optind]); bcf_hdr_t *hdr = vcf_hdr_read(fp_in); if ( !hdr ) error("Fail to read VCF/BCF header: %s\n", argv[optind]); bcf1_t *rec = bcf_init1(); // Init writer strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (out_type & FT_GZ) strcat(modew,"z"); if (out_type & FT_BCF) strcat(modew, "b"); if (out_type == FT_BCF) strcat(modew, "u"); // uncompressed BCF output htsFile *fp_out = hts_open(fname_out ? fname_out : "-", modew, NULL); vcf_hdr_write(fp_out, hdr); while ( vcf_read1(fp_in, hdr, rec) >= 0) vcf_write1(fp_out, hdr, rec); bcf_destroy1(rec); bcf_hdr_destroy(hdr); hts_close(fp_in); hts_close(fp_out); return 0; }
/*
 * destroy_data() - tear down everything held in args.
 *
 * Destroys the synced reader, every non-NULL record in the VCF ring buffer,
 * the FASTA buffer and the optional mask index, then closes the chain file
 * (only when a chain file name was given) and the output file. A failing
 * fclose() is reported through error().
 */
static void destroy_data(args_t *args)
{
    bcf_sr_destroy(args->files);

    int slot;
    for (slot = 0; slot < args->vcf_rbuf.m; slot++)
    {
        if ( !args->vcf_buf[slot] ) continue;
        bcf_destroy1(args->vcf_buf[slot]);
    }
    free(args->vcf_buf);
    free(args->fa_buf.s);

    if ( args->mask )
    {
        regidx_destroy(args->mask);
    }

    // The chain stream is only closed when a chain file name was given.
    if ( args->chain_fname && fclose(args->fp_chain) )
    {
        error("Close failed: %s\n", args->chain_fname);
    }

    if ( fclose(args->fp_out) )
    {
        error("Close failed: %s\n", args->output_fname);
    }
}
/*
 * destroy_data() - release the buffers held in args.
 *
 * Frees each non-NULL record of the ring buffer and the array that holds
 * them, destroys the filter when one is set, and frees the tmpi scratch
 * array. The args structure itself is not freed here.
 */
static void destroy_data(args_t *args)
{
    if ( args->rbuf_lines )
    {
        int k = 0;
        while ( k < args->rbuf.m )
        {
            if ( args->rbuf_lines[k] )
            {
                bcf_destroy1(args->rbuf_lines[k]);
            }
            k++;
        }
        free(args->rbuf_lines);
    }

    if ( args->filter )
    {
        filter_destroy(args->filter);
    }

    free(args->tmpi);
}
/*
 * bcf_sr_destroy1() - release the resources of a single synced reader.
 *
 * Destroys whichever index is loaded (tabix and/or BCF), the header, the
 * file handle, the iterator if present, every buffered record, and the
 * buffer, samples and filter_ids arrays. The reader structure itself is
 * left for the caller to release.
 */
static void bcf_sr_destroy1(bcf_sr_t *reader)
{
    if ( reader->tbx_idx )
    {
        tbx_destroy(reader->tbx_idx);
    }
    if ( reader->bcf_idx )
    {
        hts_idx_destroy(reader->bcf_idx);
    }

    bcf_hdr_destroy(reader->header);
    hts_close(reader->file);

    if ( reader->itr )
    {
        tbx_itr_destroy(reader->itr);
    }

    int k = 0;
    while ( k < reader->mbuffer )
    {
        bcf_destroy1(reader->buffer[k]);
        k++;
    }

    free(reader->buffer);
    free(reader->samples);
    free(reader->filter_ids);
}
int main_getalt(int argc, char *argv[]) { int c; char *fn; BGZF *fp; bcf1_t *b; bcf_hdr_t *h; kstring_t s = {0,0,0}; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc - optind == 0) { fprintf(stderr, "Usage: bgt getalt <bgt-base>\n"); return 1; } fn = (char*)calloc(strlen(argv[optind]) + 5, 1); sprintf(fn, "%s.bcf", argv[optind]); fp = bgzf_open(fn, "r"); free(fn); assert(fp); h = bcf_hdr_read(fp); b = bcf_init1(); while (bcf_read1(fp, b) >= 0) { char *ref, *alt; int l_ref, l_alt, i, min_l; bcf_get_ref_alt1(b, &l_ref, &ref, &l_alt, &alt); min_l = l_ref < l_alt? l_ref : l_alt; for (i = 0; i < min_l && ref[i] == alt[i]; ++i); s.l = 0; kputs(h->id[BCF_DT_CTG][b->rid].key, &s); kputc(':', &s); kputw(b->pos + 1 + i, &s); kputc(':', &s); kputw(b->rlen - i, &s); kputc(':', &s); kputsn(alt + i, l_alt - i, &s); puts(s.s); } bcf_destroy1(b); bcf_hdr_destroy(h); bgzf_close(fp); free(s.s); return 0; }
//write out variants to out file int flush(int pos,htsFile *outf,bcf_hdr_t *hdr_out) { int n = 0; while(_buf.size()>0 && (pos - _buf.front()->pos) > _w ) { // cerr << _last_pos<<"<="<<_buf.front()->pos<<endl; assert(_last_pos<=_buf.front()->pos); if( _last_pos!=_buf.front()->pos ) _seen.clear(); // bcf1_t *tmp = _buf.front(); //capitalises ref/alt. this should now be fixed upstream. // int i=0; // while(tmp->d.allele[0][i]) { // tmp->d.allele[0][i]=toupper(tmp->d.allele[0][i]); // i++; // } // i=0; // while(tmp->d.allele[1][i]) { // tmp->d.allele[1][i]=toupper(tmp->d.allele[1][i]); // i++; // } // bcf_update_alleles(hdr_out,tmp,(const char**)tmp->d.allele,tmp->n_allele); string variant=(string)_buf.front()->d.allele[0] +"."+ (string)_buf.front()->d.allele[1]; if(_seen.count(variant)) { _ndup++; } else { _seen.insert(variant); bcf_write1(outf, hdr_out, _buf.front()); } _last_pos=_buf.front()->pos; bcf_destroy1( _buf.front() ); _buf.pop_front(); n++; } return(n); }
/*
 * mpileup() - run the multi-sample pileup described by conf and write the
 * resulting VCF/BCF.
 *
 * Steps, in order:
 *   1. parse the optional region list/file and position on the first region;
 *   2. open every input file, read its header and register it with
 *      conf->bsmpl; files for which bam_smpl_add_bam() returns a negative id
 *      are dropped from conf->files;
 *   3. build and write the output header;
 *   4. allocate the caller buffers and run mpileup_reg() once per region (or
 *      once for the whole input when no regions are given);
 *   5. release everything allocated here.
 *
 * Any failure prints a message to stderr and exits the process.
 * Returns 0 on success.
 */
static int mpileup(mplp_conf_t *conf)
{
    if (conf->nfiles == 0) {
        fprintf(stderr,"[%s] no input file/data given\n", __func__);
        exit(EXIT_FAILURE);
    }

    mplp_ref_t mp_ref = MPLP_REF_INIT;
    conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
    conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
    conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
    conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));

    // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
    // must be kept in the memory for the whole time which can be a problem with many bams.
    // Therefore if none or only one region is requested, we initialize the bam iterator as
    // before and free the index. Only when multiple regions are queried, we keep the index.
    int nregs = 0;
    if ( conf->reg_fname )
    {
        if ( conf->reg_is_file )
        {
            conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
            if ( !conf->reg ) {
                fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
                exit(EXIT_FAILURE);
            }
        }
        else
        {
            conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
            if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
                fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
                exit(EXIT_FAILURE);
            }
        }
        nregs = regidx_nregs(conf->reg);
        conf->reg_itr = regitr_init(conf->reg);
        regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
    }

    // read the header of each file in the list and initialize data
    // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
    bam_hdr_t *hdr = NULL;      // header of first file in input list
    int i;
    for (i = 0; i < conf->nfiles; ++i) {
        bam_hdr_t *h_tmp;
        conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
        conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
        if ( !conf->mplp_data[i]->fp )
        {
            fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
            exit(EXIT_FAILURE);
        }
        if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
            fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
            exit(EXIT_FAILURE);
        }
        if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
            fprintf(stderr, "[%s] failed to process %s: %s\n",
                    __func__, conf->fai_fname, strerror(errno));
            exit(EXIT_FAILURE);
        }
        conf->mplp_data[i]->conf = conf;
        conf->mplp_data[i]->ref = &mp_ref;

        h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
        if ( !h_tmp ) {
            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
            exit(EXIT_FAILURE);
        }
        conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
        conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
        if ( conf->mplp_data[i]->bam_id<0 )
        {
            // no usable readgroups in this bam, it can be skipped
            sam_close(conf->mplp_data[i]->fp);
            free(conf->mplp_data[i]);
            bam_hdr_destroy(h_tmp);
            free(conf->files[i]);
            if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
            conf->nfiles--;
            i--;    // revisit the same slot, which now holds the next file
            continue;
        }
        if (conf->reg) {
            hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
            if (idx == NULL) {
                fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
                exit(EXIT_FAILURE);
            }
            conf->buf.l = 0;
            ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
            conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
            if ( !conf->mplp_data[i]->iter )
            {
                // Retry with the bare sequence name to tell a bad region
                // string apart from an unknown sequence.
                conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                if ( conf->mplp_data[i]->iter ) {
                    fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
                    exit(EXIT_FAILURE);
                }
                fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
                exit(EXIT_FAILURE);
            }
            if ( nregs==1 ) // no need to keep the index in memory
                hts_idx_destroy(idx);
            else
                conf->mplp_data[i]->idx = idx;
        }

        if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
        else {
            // FIXME: check consistency between h and h_tmp
            bam_hdr_destroy(h_tmp);

            // we store only the first file's header; it's (alleged to be)
            // compatible with the i-th file's target_name lookup needs
            conf->mplp_data[i]->h = hdr;
        }
    }

    // Every input may have been dropped by the skip branch above, in which
    // case no header was kept and the code below would dereference NULL.
    if ( !hdr ) {
        fprintf(stderr,"[%s] no usable input files\n", __func__);
        exit(EXIT_FAILURE);
    }

    // allocate data storage proportionate to number of samples being studied sm->n
    bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
    conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
    conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
    conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
    // write the VCF header
    conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
    if (conf->bcf_fp == NULL) {
        fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
        exit(EXIT_FAILURE);
    }
    if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);

    // BCF header creation
    conf->bcf_hdr = bcf_hdr_init("w");
    conf->buf.l = 0;

    if (conf->record_cmd_line)
    {
        ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);

        conf->buf.l = 0;
        ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
        for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
        kputc('\n', &conf->buf);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }

    if (conf->fai_fname)
    {
        conf->buf.l = 0;
        ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }

    // Translate BAM @SQ tags to BCF ##contig tags
    // todo: use/write new BAM header manipulation routines, fill also UR, M5
    for (i=0; i<hdr->n_targets; i++)
    {
        conf->buf.l = 0;
        ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }
    conf->buf.l = 0;

    bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#if CDF_MWU_TESTS
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
#endif
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
    bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
    if ( conf->fmt_flag&B2B_FMT_DP )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
    if ( conf->fmt_flag&B2B_FMT_DV )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
    if ( conf->fmt_flag&B2B_FMT_DPR )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
    if ( conf->fmt_flag&B2B_INFO_DPR )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
    if ( conf->fmt_flag&B2B_FMT_DP4 )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
    if ( conf->fmt_flag&B2B_FMT_SP )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
    if ( conf->fmt_flag&B2B_FMT_AD )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
    if ( conf->fmt_flag&B2B_FMT_ADF )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
    if ( conf->fmt_flag&B2B_FMT_ADR )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
    if ( conf->fmt_flag&B2B_INFO_AD )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
    if ( conf->fmt_flag&B2B_INFO_ADF )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
    if ( conf->fmt_flag&B2B_INFO_ADR )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
    if ( conf->gvcf )
        gvcf_update_header(conf->gvcf, conf->bcf_hdr);

    int nsmpl;
    const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
    for (i=0; i<nsmpl; i++)
        bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
    bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr);

    conf->bca = bcf_call_init(-1., conf->min_baseQ);
    conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
    conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
    conf->bca->min_frac = conf->min_frac;
    conf->bca->min_support = conf->min_support;
    conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;

    conf->bc.bcf_hdr = conf->bcf_hdr;
    conf->bc.n = nsmpl;
    conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
    if (conf->fmt_flag)
    {
        assert( sizeof(float)==sizeof(int32_t) );
        conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
        conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
        if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
        {
            // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
            conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
            conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
            for (i=0; i<nsmpl; i++)
            {
                conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
                conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
            }
        }
    }

    // init mpileup
    conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
    if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
    if ( (double)conf->max_depth * conf->nfiles > 1<<20)
        fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
    if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
        fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
    bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
    conf->max_indel_depth = conf->max_indel_depth * nsmpl;
    conf->bcf_rec = bcf_init1();
    bam_mplp_constructor(conf->iter, pileup_constructor);

    // Run mpileup for multiple regions
    if ( nregs )
    {
        int ireg = 0;
        do
        {
            // first region is already positioned
            if ( ireg++ > 0 )
            {
                conf->buf.l = 0;
                // Build the region string exactly as for the first region
                // above (beg+1, end+1). Without the +1 the query for every
                // later region was shifted one base to the left.
                ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);

                for (i=0; i<conf->nfiles; i++)
                {
                    hts_itr_destroy(conf->mplp_data[i]->iter);
                    conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
                    if ( !conf->mplp_data[i]->iter )
                    {
                        conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                        if ( conf->mplp_data[i]->iter ) {
                            fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
                            exit(EXIT_FAILURE);
                        }
                        fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
                        exit(EXIT_FAILURE);
                    }
                    bam_mplp_reset(conf->iter);
                }
            }
            mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
        }
        while ( regitr_loop(conf->reg_itr) );
    }
    else
        mpileup_reg(conf,0,0);

    flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);

    // clean up
    free(conf->bc.tmp.s);
    bcf_destroy1(conf->bcf_rec);
    if (conf->bcf_fp)
    {
        hts_close(conf->bcf_fp);
        bcf_hdr_destroy(conf->bcf_hdr);
        bcf_call_destroy(conf->bca);
        free(conf->bc.PL);
        free(conf->bc.DP4);
        free(conf->bc.ADR);
        free(conf->bc.ADF);
        free(conf->bc.fmt_arr);
        free(conf->bcr);
    }
    if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
    free(conf->buf.s);
    for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
    free(conf->gplp->plp);
    free(conf->gplp->n_plp);
    free(conf->gplp->m_plp);
    free(conf->gplp);
    bam_mplp_destroy(conf->iter);
    bam_hdr_destroy(hdr);
    for (i = 0; i < conf->nfiles; ++i) {
        // the index was only kept when more than one region was requested
        if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
        sam_close(conf->mplp_data[i]->fp);
        if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
        free(conf->mplp_data[i]);
    }
    if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
    free(conf->mplp_data);
    free(conf->plp);
    free(conf->n_plp);
    free(mp_ref.ref[0]);
    free(mp_ref.ref[1]);
    return 0;
}
int vcf_index_stats(char *fname, int stats) { char *fn_out = NULL; FILE *out; out = fn_out ? fopen(fn_out, "w") : stdout; const char **seq; int i, nseq; tbx_t *tbx = NULL; hts_idx_t *idx = NULL; htsFile *fp = hts_open(fname,"r"); if ( !fp ) { fprintf(stderr,"Could not read %s\n", fname); return 1; } bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) { fprintf(stderr,"Could not read the header: %s\n", fname); return 1; } if ( hts_get_format(fp)->format==vcf ) { tbx = tbx_index_load(fname); if ( !tbx ) { fprintf(stderr,"Could not load TBI index: %s\n", fname); return 1; } } else if ( hts_get_format(fp)->format==bcf ) { idx = bcf_index_load(fname); if ( !idx ) { fprintf(stderr,"Could not load CSI index: %s\n", fname); return 1; } } else { fprintf(stderr,"Could not detect the file type as VCF or BCF: %s\n", fname); return 1; } seq = tbx ? tbx_seqnames(tbx, &nseq) : bcf_index_seqnames(idx, hdr, &nseq); uint64_t sum = 0; for (i=0; i<nseq; i++) { uint64_t records, v; hts_idx_get_stat(tbx ? tbx->idx : idx, i, &records, &v); sum+=records; if (stats&2 || !records) continue; bcf_hrec_t *hrec = bcf_hdr_get_hrec(hdr, BCF_HL_CTG, "ID", seq[i], NULL); int hkey = hrec ? bcf_hrec_find_key(hrec, "length") : -1; fprintf(out,"%s\t%s\t%" PRIu64 "\n", seq[i], hkey<0?".":hrec->vals[hkey], records); } if (!sum) { // No counts found. // Is this because index version has no stored count data, or no records? bcf1_t *rec = bcf_init1(); if (bcf_read1(fp, hdr, rec) >= 0) { fprintf(stderr,"%s index of %s does not contain any count metadata. Please re-index with a newer version of bcftools or tabix.\n", tbx ? "TBI" : "CSI", fname); return 1; } bcf_destroy1(rec); } if (stats&2) fprintf(out, "%" PRIu64 "\n", sum); free(seq); fclose(out); hts_close(fp); bcf_hdr_destroy(hdr); if (tbx) tbx_destroy(tbx); if (idx) hts_idx_destroy(idx); return 0; }
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) { cerr << "Input: " << input << "\tOutput: "<<output<<endl; kstream_t *ks; kstring_t str = {0,0,0}; gzFile fp = gzopen(input, "r"); VarBuffer vbuf(1000); int prev_rid = -1; if(fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } char *out_fname = (char *)malloc(strlen(output)+5); strcpy(out_fname,output); strcat(out_fname,".tmp"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("depth: %s\n",out_fname); gzFile depth_fp = gzopen(out_fname, "wb1"); strcpy(out_fname,output); strcat(out_fname,".bcf"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("variants: %s\n",out_fname); htsFile *variant_fp=hts_open(out_fname,"wb1"); if(variant_fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } ks = ks_init(fp); htsFile *hfp=hts_open(input, "r"); bcf_hdr_t *hdr_in = bcf_hdr_read(hfp); hts_close(hfp); //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. 
For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0); //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0); // bcf_hdr_t *hdr_out=hdr_in; bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr_in); remove_hdr_lines(hdr_out,BCF_HL_INFO); remove_hdr_lines(hdr_out,BCF_HL_FLT); bcf_hdr_sync(hdr_out); //here we add FORMAT/PF. which is the pass filter flag for alts. assert( bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0); args_t *norm_args = init_vcfnorm(hdr_out,ref); norm_args->check_ref |= CHECK_REF_WARN; bcf1_t *bcf_rec = bcf_init(); bcf_hdr_write(variant_fp, hdr_out); kstring_t work1 = {0,0,0}; int buf[5]; ks_tokaux_t aux; int ndec=0; int ref_len,alt_len; while( ks_getuntil(ks, '\n', &str, 0) >=0) { // fprintf(stderr,"%s\n",str.s); if(str.s[0]!='#') { char *ptr = kstrtok(str.s,"\t",&aux);//chrom ptr = kstrtok(NULL,NULL,&aux);//pos work1.l=0; kputsn(str.s,ptr-str.s-1, &work1); buf[0] = bcf_hdr_name2id(hdr_in, work1.s); assert( buf[0]>=0); buf[1]=atoi(ptr)-1; ptr = kstrtok(NULL,NULL,&aux);//ID ptr = kstrtok(NULL,NULL,&aux);//REF ref_len=0; while(ptr[ref_len]!='\t') ref_len++; ptr = kstrtok(NULL,NULL,&aux);//ALT bool is_variant=false; alt_len=0; while(ptr[alt_len]!='\t') alt_len++; if(ptr[0]!='.') is_variant=true; char * QUAL_ptr = kstrtok(NULL, NULL, &aux); assert (QUAL_ptr != NULL); for(int i=0;i<2;i++) ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO //find END if it is there char *end_ptr=strstr(ptr,"END=") ; if(end_ptr!=NULL) buf[2]=atoi(end_ptr+4)-1; else buf[2]=buf[1]+alt_len-1; ptr = kstrtok(NULL,NULL,&aux);//FORMAT //find index of DP (if 
present) //if not present, dont output anything (indels ignored) char *DP_ptr = find_format(ptr,"DP"); int GQX = 0; int QUAL = 0; // AH: change code to use the minimum of GQ and QUAL fields if // GQX is not defined. See here: // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file // "GQXGenotype quality. GQX is the minimum of the GQ value // and the QUAL column. In general, these are similar values; // taking the minimum makes GQX the more conservative measure of // genotype quality." if(DP_ptr!=NULL) { buf[3]=atoi(DP_ptr); char *GQX_ptr = find_format(ptr,"GQX"); if (GQX_ptr == NULL) { GQX_ptr = find_format(ptr,"GQ"); GQX = atoi(GQX_ptr); if (QUAL_ptr[0] != '.') { QUAL = atoi(QUAL_ptr); if (QUAL < GQX) GQX = QUAL; } } else { GQX = atoi(GQX_ptr); } //trying to reduce entropy on GQ to get better compression performance. //1. rounds down to nearest 10. //2. sets gq to min(gq,100). buf[4]=GQX/10; buf[4]*=10; if(buf[4]>100) buf[4]=100; // printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]); if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int))) die("ERROR: problem writing "+(string)out_fname+".tmp"); } if(is_variant) {//wass this a variant? 
if so write it out to the bcf norm_args->ntotal++; vcf_parse(&str,hdr_in,bcf_rec); // cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl; if(prev_rid!=bcf_rec->rid) vbuf.flush(variant_fp,hdr_out); else vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); prev_rid=bcf_rec->rid; int32_t pass = bcf_has_filter(hdr_in, bcf_rec, "."); bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1); bcf_update_filter(hdr_out,bcf_rec,NULL,0); if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3 norm_args->nsplit++; split_multiallelic_to_biallelics(norm_args,bcf_rec ); for(int i=0;i<norm_args->ntmp_lines;i++){ remove_info(norm_args->tmp_lines[i]); if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH) ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } } else { remove_info(bcf_rec); if( realign(norm_args,bcf_rec) != ERR_REF_MISMATCH) ndec+=decompose(bcf_rec,hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); } } } vbuf.flush(variant_fp,hdr_out); bcf_hdr_destroy(hdr_in); bcf_hdr_destroy(hdr_out); bcf_destroy1(bcf_rec); ks_destroy(ks); gzclose(fp); gzclose(depth_fp); free(str.s); free(work1.s); hts_close(variant_fp); destroy_data(norm_args); fprintf(stderr,"Variant lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped); fprintf(stderr,"Decomposed %d MNPs\n", ndec); fprintf(stderr,"Indexing %s\n",out_fname); bcf_index_build(out_fname, BCF_LIDX_SHIFT); free(out_fname); return 0; }
int main_vcfview(int argc, char *argv[]) { int i, c, clevel = -1, flag = 0, n_samples = -1, *imap = 0, excl_snp = 0, excl_indel = 0; char *fn_ref = 0, *fn_out = 0, moder[8], **samples = 0; bcf_hdr_t *h, *hsub = 0; htsFile *in; bcf1_t *b; while ((c = getopt(argc, argv, "l:bSt:o:T:s:GNI")) >= 0) { switch (c) { case 'l': clevel = atoi(optarg); flag |= 2; break; case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'G': n_samples = 0; break; case 't': fn_ref = optarg; flag |= 1; break; case 'o': fn_out = optarg; break; case 's': samples = hts_readlines(optarg, &n_samples); break; case 'N': excl_snp = 1; break; case 'I': excl_indel = 1; break; } } if (argc == optind) { fprintf(stderr, "\nUsage: vcfview [options] <in.bcf>|<in.vcf>|<in.vcf.gz>\n\n"); fprintf(stderr, "Options: -b output in BCF\n"); fprintf(stderr, " -S input is VCF\n"); fprintf(stderr, " -o FILE output file name [stdout]\n"); fprintf(stderr, " -l INT compression level [%d]\n", clevel); fprintf(stderr, " -t FILE list of reference names and lengths [null]\n"); fprintf(stderr, " -s FILE/STR list of samples (STR if started with ':'; FILE otherwise) [null]\n"); fprintf(stderr, " -G drop individual genotype information\n"); fprintf(stderr, " -N exclude SNPs\n"); fprintf(stderr, " -I exclude INDELs\n"); fprintf(stderr, "\n"); return 1; } strcpy(moder, "r"); if ((flag&1) == 0 && !(file_type(argv[optind])&(IS_VCF|IS_VCF_GZ))) strcat(moder, "b"); in = hts_open(argv[optind], moder, fn_ref); h = vcf_hdr_read(in); if (h == 0) { fprintf(stderr, "[E::%s] fail to read the VCF/BCF2 header\n", __func__); hts_close(in); return 1; } if (n_samples >= 0) { if (n_samples) imap = (int*)malloc(n_samples * sizeof(int)); hsub = bcf_hdr_subset(h, n_samples, samples, imap); } b = bcf_init1(); if ((flag&4) == 0) { // VCF/BCF output htsFile *out; char modew[8]; strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&2) strcat(modew, "b"); out = hts_open(fn_out? 
fn_out : "-", modew, 0); vcf_hdr_write(out, hsub? hsub : h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region hts_idx_t *idx; if ((idx = bcf_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BCF index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while (bcf_itr_next((BGZF*)in->fp, iter, b) >= 0) { if (excl_snp && bcf_is_snp(b)) continue; if (excl_indel && !bcf_is_snp(b)) continue; if (n_samples >= 0) { bcf_subset(h, b, n_samples, imap); vcf_write1(out, hsub, b); } else vcf_write1(out, h, b); } hts_itr_destroy(iter); } hts_idx_destroy(idx); } else { while (vcf_read1(in, h, b) >= 0) { if (excl_snp && bcf_is_snp(b)) continue; if (excl_indel && !bcf_is_snp(b)) continue; if (n_samples >= 0) { bcf_subset(h, b, n_samples, imap); vcf_write1(out, hsub, b); } else vcf_write1(out, h, b); } } hts_close(out); } bcf_destroy1(b); if (n_samples > 0) { for (i = 0; i < n_samples; ++i) free(samples[i]); free(samples); bcf_hdr_destroy(hsub); free(imap); } bcf_hdr_destroy(h); hts_close(in); return 0; }
int main(int argc, char **argv) { int i, n; static struct option const long_opts[] = { {"out", required_argument, NULL, 1}, {"report", required_argument, NULL, 2}, {"dotasref", no_argument, NULL, 3}, {"help", no_argument, NULL, 0}, {"version", no_argument, NULL, 4}, {"export_uncov", no_argument, NULL, 5} }; bool help = FALSE; bool report_version = FALSE; while ((n = getopt_long(argc, argv, "1:2:304", long_opts, NULL)) >= 0) { switch (n) { case 1 : outfile = strdup(optarg); break; case 2 : report = strdup(optarg); break; case 3 : dotasref = TRUE; break; case 0 : help = TRUE; break; case 4 : report_version = TRUE; break; case 5 : export_uncover = TRUE; break; default : return 1; } if ( help ) return usage(); if ( report_version ) return show_version(); } n = argc - optind; if ( n > 1 ) errabort("only accept one input vcf"); if ( export_uncover == TRUE && outfile == FALSE) { warnings("export uncove region only used with option --out"); export_uncover = FALSE; } char * input; if ( n == 0 ) input = strdup("-"); else input = strdup(argv[optind]); htsFile * fp = read_vcf_file(input); enum htsExactFormat fmt = hts_get_format(fp)->format; if ( fmt != vcf && fmt != bcf ) errabort("This is not a VCF/BCF file : %s", input); bcf_hdr_t * hdr = bcf_hdr_read(fp); int n_samples = bcf_hdr_nsamples(hdr); if ( n_samples != 2 ) errabort("the input VCF/BCF file must contain only two samples! 
%d", n_samples); LOG("Using sample %s as ref ...", hdr->samples[0]); LOG("Using sample %s as test ...", hdr->samples[1]); uint32_t matrix[4][4] = { {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0} }; bcf1_t * v = bcf_init1(); kstring_t str = { 0, 0, 0 }; uint32_t line = 0; htsFile *out = NULL; if ( outfile && !check_filename(outfile) ) out = hts_open(outfile, mode); if ( out != NULL ) bcf_hdr_write(out, hdr); while ( bcf_read1(fp, hdr, v) >= 0 ) { bcf_unpack(v, BCF_UN_STR|BCF_UN_FMT); int k; str.l = 0; int tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GT"); if ( !bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, tag_id) ) warnings("There is no 'GT' in the header!"); for ( i = 0; i < v->n_fmt; ++i ) if ( v->d.fmt[i].id == tag_id ) break; if ( i == v->n_fmt ) { vcf_format1(hdr, v, &str); LOG("There is no tag GT in this line : %s", str.s); continue; } corr_t xy[2] = { {-1, -2, -2}, {-1, -2, -2} }; bcf_fmt_t * fmt = &v->d.fmt[i]; for ( i = 0; i < 2; ++i ) { int corr = i; if ( fmt == NULL ) { if ( dotasref == TRUE ) xy[corr].alt = ALT_IS_REF; else xy[corr].alt = ALT_IS_UNC; continue; } int last = -2; uint8_t *d = (uint8_t*)((char*)fmt->p + fmt->size*i); for ( k = 0; k < fmt->n && d[k] != (uint8_t)bcf_int8_vector_end; ++k ) { int curr = d[k]>>1; if ( last != curr ) { if ( curr ) { if ( last == -2 ) xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; else xy[corr].alt = ALT_IS_HET; } else { xy[corr].alt = dotasref == TRUE ? ALT_IS_REF : ALT_IS_UNC; } } else { if ( curr ) { xy[corr].alt = curr > 1 ? ALT_IS_HOM : ALT_IS_REF; } else { xy[corr].alt = dotasref == TRUE ? 
ALT_IS_REF : ALT_IS_UNC; } } if (last == -2 ) { xy[corr].min = xy[corr].max = curr; } else { if ( curr < xy[corr].min ) xy[corr].min = curr; else if ( curr > xy[corr].max ) xy[corr].max = curr; } last = curr; } } matrix[xy[0].alt][xy[1].alt]++; if ( xy[0].alt != xy[1].alt && out != NULL) { if ( xy[0].alt == ALT_IS_UNC || xy[1].alt == ALT_IS_UNC ) { if ( export_uncover == TRUE ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } else { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } if ( xy[0].alt == ALT_IS_HET && xy[1].alt == ALT_IS_HET && (xy[0].min != xy[1].min || xy[0].max != xy[1].max ) ) { bias++; matrix[ALT_IS_HET][ALT_IS_HET]--; if ( out != NULL ) { str.l = 0; vcf_format1(hdr, v, &str); vcf_write(out, hdr, v); } } line++; } if ( out ) hts_close(out); if ( str.m ) free(str.s); write_report(matrix, hdr); bcf_hdr_destroy(hdr); free(input); bcf_destroy1(v); if ( outfile ) free(outfile); if ( report ) free(report); if ( hts_close(fp) ) warnings("hts_close returned non-zero status: %s", input); return 0; }
/*
 * Performs pileup over one or more alignment files.
 *
 * Depending on conf->flag this either writes genotype-likelihood records as
 * VCF/BCF (MPLP_BCF) or a textual pileup (one line per covered position).
 * Output goes to conf->output_fname, or to standard output when it is NULL.
 *
 * @param conf configuration for this pileup
 * @param n    number of files specified in fn
 * @param fn   filenames
 * @return the last value returned by bam_mplp_auto(); the loop runs while
 *         that value is positive, so the result is zero or negative.
 *
 * The process exits with status 1 when there is no input, when a file, its
 * header or its index cannot be read, when the region cannot be parsed, when
 * the output cannot be opened or written, or when memory runs out.
 */
static int mpileup(mplp_conf_t *conf, int n, char **fn)
{
    extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list);
    extern void bcf_call_del_rghash(void *rghash);
    mplp_aux_t **data;
    /* ref_len starts at 0 so it is never passed on uninitialised when no
     * reference sequence is loaded. */
    int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len = 0, ref_tid = -1, max_depth, max_indel_depth;
    const bam_pileup1_t **plp;
    bam_mplp_t iter;
    bam_hdr_t *h = NULL; /* header of first file in input list */
    char *ref;
    void *rghash = NULL;
    FILE *pileup_fp = NULL;

    bcf_callaux_t *bca = NULL;
    bcf_callret1_t *bcr = NULL;
    bcf_call_t bc;
    htsFile *bcf_fp = NULL;
    bcf_hdr_t *bcf_hdr = NULL;

    bam_sample_t *sm = NULL;
    kstring_t buf;
    mplp_pileup_t gplp;

    // Reject empty input before anything is allocated.
    if (n == 0) {
        fprintf(stderr,"[%s] no input file/data given\n", __func__);
        exit(1);
    }

    memset(&gplp, 0, sizeof(mplp_pileup_t));
    memset(&buf, 0, sizeof(kstring_t));
    memset(&bc, 0, sizeof(bcf_call_t));
    data = calloc(n, sizeof(mplp_aux_t*));
    plp = calloc(n, sizeof(bam_pileup1_t*));
    n_plp = calloc(n, sizeof(int));
    if (data == NULL || plp == NULL || n_plp == NULL) {
        fprintf(stderr, "[%s] out of memory\n", __func__);
        exit(1);
    }
    sm = bam_smpl_init();

    // read the header of each file in the list and initialize data
    for (i = 0; i < n; ++i) {
        bam_hdr_t *h_tmp;
        data[i] = calloc(1, sizeof(mplp_aux_t));
        if (data[i] == NULL) {
            fprintf(stderr, "[%s] out of memory\n", __func__);
            exit(1);
        }
        data[i]->fp = sam_open(fn[i], "rb");
        if ( !data[i]->fp )
        {
            fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, fn[i], strerror(errno));
            exit(1);
        }
        hts_set_fai_filename(data[i]->fp, conf->fai_fname);
        data[i]->conf = conf;
        h_tmp = sam_hdr_read(data[i]->fp);
        if ( !h_tmp ) {
            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, fn[i]);
            exit(1);
        }
        data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet
        bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text);
        // Collect read group IDs with PL (platform) listed in pl_list (note: fragile, strstr search)
        rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list);
        if (conf->reg) {
            hts_idx_t *idx = sam_index_load(data[i]->fp, fn[i]);
            if (idx == 0) {
                fprintf(stderr, "[%s] fail to load index for %s\n", __func__, fn[i]);
                exit(1);
            }
            if ( (data[i]->iter=sam_itr_querys(idx, data[i]->h, conf->reg)) == 0) {
                fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, conf->reg);
                exit(1);
            }
            // The region bounds of the first file are used for all files.
            if (i == 0) tid0 = data[i]->iter->tid, beg0 = data[i]->iter->beg, end0 = data[i]->iter->end;
            hts_idx_destroy(idx);
        }
        if (i == 0) h = h_tmp; /* save the header of first file in list */
        else {
            // FIXME: to check consistency
            bam_hdr_destroy(h_tmp);
        }
    }

    // allocate data storage proportionate to number of samples being studied sm->n
    gplp.n = sm->n;
    gplp.n_plp = calloc(sm->n, sizeof(int));
    gplp.m_plp = calloc(sm->n, sizeof(int));
    gplp.plp = calloc(sm->n, sizeof(bam_pileup1_t*));
    if (gplp.n_plp == NULL || gplp.m_plp == NULL || gplp.plp == NULL) {
        fprintf(stderr, "[%s] out of memory\n", __func__);
        exit(1);
    }

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n);

    // write the VCF header
    if (conf->flag & MPLP_BCF)
    {
        const char *mode;
        if ( conf->flag & MPLP_VCF )
            mode = (conf->flag&MPLP_NO_COMP)? "wu" : "wz";   // uncompressed VCF or compressed VCF
        else
            mode = (conf->flag&MPLP_NO_COMP)? "wub" : "wb";  // uncompressed BCF or compressed BCF

        bcf_fp = bcf_open(conf->output_fname? conf->output_fname : "-", mode);
        if (bcf_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
            exit(1);
        }

        bcf_hdr = bcf_hdr_init("w");
        kstring_t str = {0,0,0};

        ksprintf(&str, "##samtoolsVersion=%s+htslib-%s\n",samtools_version(),hts_version());
        bcf_hdr_append(bcf_hdr, str.s);

        str.l = 0;
        ksprintf(&str, "##samtoolsCommand=samtools mpileup");
        for (i=1; i<conf->argc; i++) ksprintf(&str, " %s", conf->argv[i]);
        kputc('\n', &str);
        bcf_hdr_append(bcf_hdr, str.s);

        if (conf->fai_fname)
        {
            str.l = 0;
            ksprintf(&str, "##reference=file://%s\n", conf->fai_fname);
            bcf_hdr_append(bcf_hdr, str.s);
        }

        // todo: use/write new BAM header manipulation routines, fill also UR, M5
        for (i=0; i<h->n_targets; i++)
        {
            str.l = 0;
            // target_len is taken to be unsigned 32-bit (as in htslib's bam_hdr_t); print it as such.
            ksprintf(&str, "##contig=<ID=%s,length=%u>", h->target_name[i], (unsigned)h->target_len[i]);
            bcf_hdr_append(bcf_hdr, str.s);
        }
        free(str.s);
        bcf_hdr_append(bcf_hdr,"##ALT=<ID=X,Description=\"Represents allele(s) other than observed.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#if CDF_MWU_TESTS
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
#endif
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
        bcf_hdr_append(bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
        bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
        if ( conf->fmt_flag&B2B_FMT_DP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
        if ( conf->fmt_flag&B2B_FMT_DV )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
        if ( conf->fmt_flag&B2B_FMT_DPR )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_INFO_DPR )
            bcf_hdr_append(bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
        if ( conf->fmt_flag&B2B_FMT_DP4 )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
        if ( conf->fmt_flag&B2B_FMT_SP )
            bcf_hdr_append(bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");

        for (i=0; i<sm->n; i++)
            bcf_hdr_add_sample(bcf_hdr, sm->smpl[i]);
        bcf_hdr_add_sample(bcf_hdr, NULL);
        bcf_hdr_write(bcf_fp, bcf_hdr);

        bca = bcf_call_init(-1., conf->min_baseQ);
        bcr = calloc(sm->n, sizeof(bcf_callret1_t));
        bca->rghash = rghash;
        bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ;
        bca->min_frac = conf->min_frac;
        bca->min_support = conf->min_support;
        bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;

        bc.bcf_hdr = bcf_hdr;
        bc.n = sm->n;
        bc.PL = malloc(15 * sm->n * sizeof(*bc.PL));
        if (bcr == NULL || bc.PL == NULL) {
            fprintf(stderr, "[%s] out of memory\n", __func__);
            exit(1);
        }
        if (conf->fmt_flag)
        {
            assert( sizeof(float)==sizeof(int32_t) );
            bc.DP4 = malloc(sm->n * sizeof(int32_t) * 4);
            bc.fmt_arr = malloc(sm->n * sizeof(float)); // all fmt_flag fields
            if (bc.DP4 == NULL || bc.fmt_arr == NULL) {
                fprintf(stderr, "[%s] out of memory\n", __func__);
                exit(1);
            }
            if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR) )
            {
                // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
                bc.DPR = malloc((sm->n+1)*B2B_MAX_ALLELES*sizeof(int32_t));
                if (bc.DPR == NULL) {
                    fprintf(stderr, "[%s] out of memory\n", __func__);
                    exit(1);
                }
                for (i=0; i<sm->n; i++)
                    bcr[i].DPR = bc.DPR + (i+1)*B2B_MAX_ALLELES;
            }
        }
    }
    else {
        pileup_fp = conf->output_fname? fopen(conf->output_fname, "w") : stdout;

        if (pileup_fp == NULL) {
            fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname, strerror(errno));
            exit(1);
        }
    }

    if (tid0 >= 0 && conf->fai) { // region is set
        ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len);
        ref_tid = tid0;
        for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0;
    } else ref_tid = -1, ref = 0;

    // begin pileup
    iter = bam_mplp_init(n, mplp_func, (void**)data);
    if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(iter);
    max_depth = conf->max_depth;
    if (max_depth * sm->n > 1<<20)
        fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
    if (max_depth * sm->n < 8000) {
        max_depth = 8000 / sm->n;
        fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
    }
    max_indel_depth = conf->max_indel_depth * sm->n;
    bam_mplp_set_maxcnt(iter, max_depth);
    bcf1_t *bcf_rec = bcf_init1();
    int ret;
    while ( (ret=bam_mplp_auto(iter, &tid, &pos, n_plp, plp)) > 0) {
        if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested
        if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue;
        if (tid != ref_tid) {
            // Moved to a new reference sequence: swap the cached reference.
            free(ref); ref = 0;
            if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len);
            for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid;
            ref_tid = tid;
        }
        if (conf->flag & MPLP_BCF) {
            int total_depth, _ref0, ref16;
            for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i];
            group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG);
            _ref0 = (ref && pos < ref_len)? ref[pos] : 'N';
            ref16 = seq_nt16_table[_ref0];
            bcf_callaux_clean(bca, &bc);
            for (i = 0; i < gplp.n; ++i)
                bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i);
            bc.tid = tid; bc.pos = pos;
            bcf_call_combine(gplp.n, bcr, bca, ref16, &bc);
            bcf_clear1(bcf_rec);
            bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, 0, 0);
            if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) < 0) {
                fprintf(stderr, "[%s] failed to write to %s\n", __func__, conf->output_fname? conf->output_fname : "standard output");
                exit(1);
            }
            // call indels; todo: subsampling with total_depth>max_indel_depth instead of ignoring?
            if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0)
            {
                bcf_callaux_clean(bca, &bc);
                for (i = 0; i < gplp.n; ++i)
                    bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i);
                if (bcf_call_combine(gplp.n, bcr, bca, -1, &bc) >= 0) {
                    bcf_clear1(bcf_rec);
                    bcf_call2bcf(&bc, bcf_rec, bcr, conf->fmt_flag, bca, ref);
                    if (bcf_write1(bcf_fp, bcf_hdr, bcf_rec) < 0) {
                        fprintf(stderr, "[%s] failed to write to %s\n", __func__, conf->output_fname? conf->output_fname : "standard output");
                        exit(1);
                    }
                }
            }
        } else {
            fprintf(pileup_fp, "%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N');
            for (i = 0; i < n; ++i) {
                int j, cnt;
                // Count only the reads whose base quality passes the threshold.
                for (j = cnt = 0; j < n_plp[i]; ++j) {
                    const bam_pileup1_t *p = plp[i] + j;
                    if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ) ++cnt;
                }
                fprintf(pileup_fp, "\t%d\t", cnt);
                if (n_plp[i] == 0) {
                    fputs("*\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_MAPQ) fputs("\t*", pileup_fp);
                    if (conf->flag & MPLP_PRINT_POS) fputs("\t*", pileup_fp);
                } else {
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        if (bam_get_qual(p->b)[p->qpos] >= conf->min_baseQ)
                            pileup_seq(pileup_fp, plp[i] + j, pos, ref_len, ref);
                    }
                    putc('\t', pileup_fp);
                    for (j = 0; j < n_plp[i]; ++j) {
                        const bam_pileup1_t *p = plp[i] + j;
                        int c = bam_get_qual(p->b)[p->qpos];
                        if (c >= conf->min_baseQ) {
                            c = c + 33 < 126? c + 33 : 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_MAPQ) {
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            const bam_pileup1_t *p = plp[i] + j;
                            int c = bam_get_qual(p->b)[p->qpos];
                            if ( c < conf->min_baseQ ) continue;
                            c = plp[i][j].b->core.qual + 33;
                            if (c > 126) c = 126;
                            putc(c, pileup_fp);
                        }
                    }
                    if (conf->flag & MPLP_PRINT_POS) {
                        // Apply the same base-quality filter as the other
                        // columns so every column lists the same reads.
                        int n_printed = 0;
                        putc('\t', pileup_fp);
                        for (j = 0; j < n_plp[i]; ++j) {
                            const bam_pileup1_t *p = plp[i] + j;
                            if (bam_get_qual(p->b)[p->qpos] < conf->min_baseQ) continue;
                            if (n_printed++ > 0) putc(',', pileup_fp);
                            fprintf(pileup_fp, "%d", p->qpos + 1); // FIXME: printf() is very slow...
                        }
                    }
                }
            }
            putc('\n', pileup_fp);
        }
    }

    // clean up
    free(bc.tmp.s);
    bcf_destroy1(bcf_rec);
    if (bcf_fp)
    {
        hts_close(bcf_fp);
        bcf_hdr_destroy(bcf_hdr);
        bcf_call_destroy(bca);
        free(bc.PL);
        free(bc.DP4);
        free(bc.DPR);
        free(bc.fmt_arr);
        free(bcr);
    }
    // stdout is not ours to close; only close a file we opened ourselves.
    if (pileup_fp && conf->output_fname) fclose(pileup_fp);
    bam_smpl_destroy(sm); free(buf.s);
    for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]);
    free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp);
    bcf_call_del_rghash(rghash);
    bam_mplp_destroy(iter);
    bam_hdr_destroy(h);
    for (i = 0; i < n; ++i) {
        sam_close(data[i]->fp);
        if (data[i]->iter) hts_itr_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data); free(plp); free(ref); free(n_plp);
    return ret;
}
int main(int argc, char **argv) { char *fname = argc>1 ? argv[1] : "/dev/null"; htsFile *fp = hts_open(fname, "w"); bcf_hdr_t *hdr1, *hdr2; hdr1 = bcf_hdr_init("w"); hdr2 = bcf_hdr_init("w"); // Add two shared and two private annotations bcf_hdr_append(hdr1, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=2>"); bcf_hdr_append(hdr2, "##contig=<ID=1>"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT1,Description=\"Filter 1\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT4,Description=\"Filter 4\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT3,Description=\"Filter 3\">"); bcf_hdr_append(hdr2, "##FILTER=<ID=FLT2,Description=\"Filter 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF1,Number=.,Type=Integer,Description=\"Info 1\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF4,Number=.,Type=Integer,Description=\"Info 4\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF3,Number=.,Type=Integer,Description=\"Info 3\">"); bcf_hdr_append(hdr2, "##INFO=<ID=INF2,Number=.,Type=Integer,Description=\"Info 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT1,Number=.,Type=Integer,Description=\"FMT 1\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_append(hdr1, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT4,Number=.,Type=Integer,Description=\"FMT 4\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT3,Number=.,Type=Integer,Description=\"FMT 3\">"); bcf_hdr_append(hdr2, "##FORMAT=<ID=FMT2,Number=.,Type=Integer,Description=\"FMT 2\">"); bcf_hdr_add_sample(hdr1,"SMPL1"); bcf_hdr_add_sample(hdr1,"SMPL2"); bcf_hdr_add_sample(hdr2,"SMPL1"); 
bcf_hdr_add_sample(hdr2,"SMPL2"); bcf_hdr_sync(hdr1); bcf_hdr_sync(hdr2); hdr2 = bcf_hdr_merge(hdr2,hdr1); bcf_hdr_sync(hdr2); if ( bcf_hdr_write(fp, hdr2)!=0 ) error("Failed to write to %s\n", fname); bcf1_t *rec = bcf_init1(); rec->rid = bcf_hdr_name2id(hdr1, "1"); rec->pos = 0; bcf_update_alleles_str(hdr1, rec, "G,A"); int32_t tmpi[3]; tmpi[0] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT1"); tmpi[1] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"); tmpi[2] = bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT3"); bcf_update_filter(hdr1, rec, tmpi, 3); tmpi[0] = 1; bcf_update_info_int32(hdr1, rec, "INF1", tmpi, 1); tmpi[0] = 2; bcf_update_info_int32(hdr1, rec, "INF2", tmpi, 1); tmpi[0] = 3; bcf_update_info_int32(hdr1, rec, "INF3", tmpi, 1); tmpi[0] = tmpi[1] = 1; bcf_update_format_int32(hdr1, rec, "FMT1", tmpi, 2); tmpi[0] = tmpi[1] = 2; bcf_update_format_int32(hdr1, rec, "FMT2", tmpi, 2); tmpi[0] = tmpi[1] = 3; bcf_update_format_int32(hdr1, rec, "FMT3", tmpi, 2); bcf_remove_filter(hdr1, rec, bcf_hdr_id2int(hdr1, BCF_DT_ID, "FLT2"), 0); bcf_update_info_int32(hdr1, rec, "INF2", NULL, 0); bcf_update_format_int32(hdr1, rec, "FMT2", NULL, 0); bcf_translate(hdr2, hdr1, rec); if ( bcf_write(fp, hdr2, rec)!=0 ) error("Faild to write to %s\n", fname); // Clean bcf_destroy1(rec); bcf_hdr_destroy(hdr1); bcf_hdr_destroy(hdr2); int ret; if ( (ret=hts_close(fp)) ) { fprintf(stderr,"hts_close(%s): non-zero status %d\n",fname,ret); exit(ret); } return 0; }
int main_view(int argc, char *argv[]) { int i, c, n_files = 0, out_bcf = 0, clevel = -1, multi_flag = 0, excl = 0, not_vcf = 0, in_mem = 0, u_set = 0; long seekn = -1, n_rec = LONG_MAX, n_read = 0; bgtm_t *bm = 0; bcf1_t *b; htsFile *out = 0; char modew[8], *reg = 0, *site_flt = 0; void *bed = 0; int n_groups = 0; char *gexpr[BGT_MAX_GROUPS], *aexpr = 0, *dbfn = 0, *fmt = 0; bgt_file_t **files = 0; fmf_t *vardb = 0; while ((c = getopt(argc, argv, "ubs:r:l:CMGB:ef:g:a:i:n:SHt:d:")) >= 0) { if (c == 'b') out_bcf = 1; else if (c == 'r') reg = optarg; else if (c == 'l') clevel = atoi(optarg); else if (c == 'e') excl = 1; else if (c == 'u') u_set = 1; else if (c == 'B') bed = bed_read(optarg); else if (c == 'C') multi_flag |= BGT_F_SET_AC; else if (c == 'G') multi_flag |= BGT_F_NO_GT; else if (c == 'S') multi_flag |= BGT_F_NO_GT | BGT_F_CNT_AL, not_vcf = 1; else if (c == 'H') multi_flag |= BGT_F_NO_GT | BGT_F_CNT_HAP, not_vcf = 1; else if (c == 'M') in_mem = 1; else if (c == 'i') seekn = atol(optarg) - 1; else if (c == 'n') n_rec = atol(optarg); else if (c == 'f') site_flt = optarg; else if (c == 't') fmt = optarg, not_vcf = 1; else if (c == 'd') dbfn = optarg; else if (c == 's' && n_groups < BGT_MAX_GROUPS) gexpr[n_groups++] = optarg; else if (c == 'a') aexpr = optarg; } if (n_rec < 0) { fprintf(stderr, "[E::%s] option -n must be at least 0.\n", __func__); return 1; } if (clevel > 9) clevel = 9; if (u_set) clevel = 0, out_bcf = 1; if (n_groups > 1) multi_flag |= BGT_F_SET_AC; if (argc - optind < 1) { fprintf(stderr, "Usage: bgt %s [options] <bgt-prefix> [...]", argv[0]); fputc('\n', stderr); fprintf(stderr, "Options:\n"); fprintf(stderr, " Sample selection:\n"); fprintf(stderr, " -s EXPR samples list (,sample1,sample2 or a file or expr; see Notes below) [all]\n"); fprintf(stderr, " Site selection:\n"); fprintf(stderr, " -r STR region [all]\n"); fprintf(stderr, " -B FILE extract variants overlapping BED FILE []\n"); fprintf(stderr, " -e exclude variants overlapping BED 
FILE (effective with -B)\n"); fprintf(stderr, " -i INT process from the INT-th record (1-based) []\n"); fprintf(stderr, " -n INT process at most INT records []\n"); fprintf(stderr, " -d FILE variant annotations in FMF (to work with -a) []\n"); fprintf(stderr, " -M load variant annotations in RAM (only with -d)\n"); fprintf(stderr, " -a EXPR alleles list chr:1basedPos:refLen:seq (,allele1,allele2 or a file or expr) []\n"); fprintf(stderr, " -f STR frequency filters []\n"); fprintf(stderr, " VCF output:\n"); fprintf(stderr, " -b BCF output (effective without -S/-H)\n"); fprintf(stderr, " -l INT compression level for BCF [default]\n"); fprintf(stderr, " -u equivalent to -bl0 (overriding -b and -l)\n"); fprintf(stderr, " -G don't output sample genotypes\n"); fprintf(stderr, " -C write AC/AN to the INFO field (auto applied with -f or multipl -s)\n"); fprintf(stderr, " Non-VCF output:\n"); fprintf(stderr, " -S show samples with a set of alleles (with -a)\n"); fprintf(stderr, " -H count of haplotypes with a set of alleles (with -a)\n"); fprintf(stderr, " -t STR comma-delimited list of fields to output. Accepted variables:\n"); fprintf(stderr, " AC, AN, AC#, AN#, CHROM, POS, END, REF, ALT (# for a group number)\n"); fprintf(stderr, "Notes:\n"); fprintf(stderr, " For option -s/-a, EXPR can be one of:\n"); fprintf(stderr, " 1) comma-delimited list following a colon/comma. e.g. -s,NA12878,NA12044\n"); fprintf(stderr, " 2) space-delimited file with the first column giving a sample/allele name. e.g. -s list.txt\n"); fprintf(stderr, " 3) expression if .spl/-d file contains metadata. 
e.g.: -s\"gender=='M'&&population!='CEU'\"\n"); fprintf(stderr, " If multiple -s is specified, the AC/AN of the first group will be written to VCF INFO AC1/AN1,\n"); fprintf(stderr, " the second to AC2/AN2, etc.\n"); return 1; } if (dbfn && in_mem) vardb = fmf_read(dbfn), dbfn = 0; if ((multi_flag&(BGT_F_CNT_AL|BGT_F_CNT_HAP)) && aexpr == 0) { fprintf(stderr, "[E::%s] -a must be specified when -S/-H is in use.\n", __func__); return 1; } n_files = argc - optind; files = (bgt_file_t**)calloc(n_files, sizeof(bgt_file_t*)); for (i = 0; i < n_files; ++i) { files[i] = bgt_open(argv[optind+i]); if (files[i] == 0) { fprintf(stderr, "[E::%s] failed to open BGT with prefix '%s'\n", __func__, argv[optind+i]); return 1; // FIXME: memory leak } } bm = bgtm_reader_init(n_files, files); bgtm_set_flag(bm, multi_flag); if (site_flt && bgtm_set_flt_site(bm, site_flt) != 0) { fprintf(stderr, "[E::%s] failed to set frequency filters. Syntax error?\n", __func__); return 1; } if (reg && bgtm_set_region(bm, reg) < 0) { fprintf(stderr, "[E::%s] failed to set region. 
Region format error?\n", __func__); return 1; } if (bed) bgtm_set_bed(bm, bed, excl); if (fmt && bgtm_set_table(bm, fmt) < 0) { fprintf(stderr, "[E::%s] failed to set tabular output.\n", __func__); return 1; } if (seekn > 0) bgtm_set_start(bm, seekn); if (aexpr) { int n_al; n_al = bgtm_set_alleles(bm, aexpr, vardb, dbfn); if (n_al < 0) { fprintf(stderr, "[E::%s] failed to set alleles.\n", __func__); return 1; } else if (n_al == 0) fprintf(stderr, "[W::%s] no alleles selected.\n", __func__); } for (i = 0; i < n_groups; ++i) { if (bgtm_add_group(bm, gexpr[i]) < 0) { fprintf(stderr, "[E::%s] failed to add sample group '%s'.\n", __func__, gexpr[i]); return 1; } } bgtm_prepare(bm); // bgtm_prepare() generates the VCF header if (!not_vcf) { strcpy(modew, "w"); if (out_bcf) strcat(modew, "b"); sprintf(modew + strlen(modew), "%d", clevel); out = hts_open("-", modew, 0); vcf_hdr_write(out, bm->h_out); } b = bcf_init1(); while (bgtm_read(bm, b) >= 0 && n_read < n_rec) { if (out) vcf_write1(out, bm->h_out, b); if (fmt && bm->n_fields > 0) puts(bm->tbl_line.s); ++n_read; } bcf_destroy1(b); if (not_vcf && bm->n_aal > 0) { if (bm->flag & BGT_F_CNT_HAP) { bgt_hapcnt_t *hc; int n_hap; char *s; hc = bgtm_hapcnt(bm, &n_hap); s = bgtm_hapcnt_print_destroy(bm, n_hap, hc); fputs(s, stdout); free(s); } if (bm->flag & BGT_F_CNT_AL) { char *s; if ((s = bgtm_alcnt_print(bm)) != 0) fputs(s, stdout); free(s); } } if (out) hts_close(out); bgtm_reader_destroy(bm); if (bed) bed_destroy(bed); for (i = 0; i < n_files; ++i) bgt_close(files[i]); free(files); if (vardb) fmf_destroy(vardb); return 0; }