static void destroy_data(args_t *args) { if ( args->flag & CF_CCALL ) ccall_destroy(&args->aux); else if ( args->flag & CF_MCALL ) mcall_destroy(&args->aux); else if ( args->flag & CF_QCALL ) qcall_destroy(&args->aux); int i; for (i=0; i<args->nsamples; i++) free(args->samples[i]); if ( args->aux.fams ) { for (i=0; i<args->aux.nfams; i++) free(args->aux.fams[i].name); free(args->aux.fams); } if ( args->missed_line ) bcf_destroy(args->missed_line); ploidy_destroy(args->ploidy); if ( args->gvcf.line ) bcf_destroy(args->gvcf.line); free(args->gvcf.gt); free(args->gvcf.dp); free(args->sex2ploidy); free(args->sex2ploidy_prev); free(args->samples); free(args->samples_map); free(args->sample2sex); free(args->aux.ploidy); bcf_hdr_destroy(args->aux.hdr); hts_close(args->out_fh); bcf_sr_destroy(args->aux.srs); }
static void process(args_t *args) { bcf1_t *rec = bcf_sr_get_line(args->sr,0); bcf_unpack(rec, BCF_UN_ALL); int i, site_pass = 1; const uint8_t *smpl_pass = NULL; if ( args->filter ) { site_pass = filter_test(args->filter, rec, &smpl_pass); if ( args->filter_logic & FLT_EXCLUDE ) site_pass = site_pass ? 0 : 1; } bcf1_t *out = NULL; for (i=0; i<rec->n_sample; i++) { if ( !args->fh[i] ) continue; if ( !smpl_pass && !site_pass ) continue; if ( smpl_pass ) { int pass = args->filter_logic & FLT_EXCLUDE ? ( smpl_pass[i] ? 0 : 1) : smpl_pass[i]; if ( !pass ) continue; } if ( !out ) out = rec_set_info(args, rec); rec_set_format(args, rec, i, out); bcf_write(args->fh[i], args->hdr_out, out); } if ( out ) bcf_destroy(out); }
BlockQuantify::BlockQuantifyImpl::~BlockQuantifyImpl() { for(auto x : variants) { bcf_destroy(x); } }
/** * Returns true if chrom:start1-end1 overlaps with a region in the file. */ bool OrderedBCFOverlapMatcher::overlaps_with(std::string& chrom, int32_t start1, int32_t end1) { bool overlaps = false; //moves to new chromosome if (current_interval.seq!=chrom) { buffer.clear(); current_interval.set(chrom); odr->jump_to_interval(current_interval); std::cerr << "Jumped to chromosome " << chrom << "\n"; while (odr->read(v)) { if (bcf_get_end_pos1(v)<start1) continue; overlaps = overlaps || (bcf_get_pos1(v)<=end1); buffer.push_back(v); if (bcf_get_pos1(v)>end1) break; v = bcf_init(); } } else { //scythe preceding bed records std::list<bcf1_t*>::iterator i = buffer.begin(); while (i!=buffer.end()) { if (bcf_get_end_pos1(*i)<start1) { bcf_destroy(*i); i = buffer.erase(i); continue; } overlaps = (bcf_get_pos1(*i)<=end1); break; } if (!overlaps) { if (buffer.empty()) { while (odr->read(v)) { if (bcf_get_end_pos1(*i)<start1) continue; overlaps = overlaps || (bcf_get_pos1(*i)<=end1); buffer.push_back(v); if (bcf_get_pos1(v)>end1) break; v = bcf_init(); } } } } return overlaps; };
void call_decomp_destroy(CallDecomp *dc) { alignment_free(dc->aln); needleman_wunsch_free(dc->nw_aligner); ctx_free(dc->scoring); bcf_destroy(dc->v); strbuf_dealloc(&dc->sbuf); ctx_free(dc); }
void vcfbuf_destroy(vcfbuf_t *buf) { int i; for (i=0; i<buf->rbuf.m; i++) if ( buf->vcf[i].rec ) bcf_destroy(buf->vcf[i].rec); free(buf->vcf); free(buf->prune.farr); free(buf->prune.vrec); free(buf->prune.ac); free(buf->prune.idx); free(buf); }
/** * Closes files. */ void BCFSyncedReader::close() { for (size_t i=0; i<nfiles; ++i) { bcf_close(files[i]); bcf_hdr_destroy(hdrs[i]); bcf_itr_destroy(itrs[i]); } while (pool.size()!=0) { bcf_destroy(pool.front()); pool.pop_front(); } }
static int view_vcf(hFILE *hfp, const char *filename) { vcfFile *in = hts_hopen(hfp, filename, "r"); if (in == NULL) return 0; vcfFile *out = dup_stdout("w"); bcf_hdr_t *hdr = bcf_hdr_read(in); if (show_headers) bcf_hdr_write(out, hdr); if (mode == view_all) { bcf1_t *rec = bcf_init(); while (bcf_read(in, hdr, rec) >= 0) bcf_write(out, hdr, rec); bcf_destroy(rec); } bcf_hdr_destroy(hdr); hts_close(out); hts_close(in); return 1; }
static void destroy_data(args_t *args) { int i; for (i=0; i<args->nfnames; i++) free(args->fnames[i]); free(args->fnames); if ( args->files ) bcf_sr_destroy(args->files); if ( hts_close(args->out_fh)!=0 ) error("hts_close error\n"); bcf_hdr_destroy(args->out_hdr); free(args->seen_seq); free(args->start_pos); free(args->swap_phase); for (i=0; i<args->mbuf; i++) bcf_destroy(args->buf[i]); free(args->buf); free(args->GTa); free(args->GTb); free(args->nmatch); free(args->nmism); free(args->phase_qual); free(args->phase_set); }
static int query_regions(args_t *args, char *fname, char **regs, int nregs) { int i; htsFile *fp = hts_open(fname,"r"); if ( !fp ) error("Could not read %s\n", fname); enum htsExactFormat format = hts_get_format(fp)->format; regidx_t *reg_idx = NULL; if ( args->targets_fname ) { reg_idx = regidx_init(args->targets_fname, NULL, NULL, 0, NULL); if ( !reg_idx ) error("Could not read %s\n", args->targets_fname); } if ( format == bcf ) { htsFile *out = hts_open("-","w"); if ( !out ) error("Could not open stdout\n", fname); hts_idx_t *idx = bcf_index_load(fname); if ( !idx ) error("Could not load .csi index of %s\n", fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Could not read the header: %s\n", fname); if ( args->print_header ) bcf_hdr_write(out,hdr); if ( !args->header_only ) { bcf1_t *rec = bcf_init(); for (i=0; i<nregs; i++) { hts_itr_t *itr = bcf_itr_querys(idx,hdr,regs[i]); while ( bcf_itr_next(fp, itr, rec) >=0 ) { if ( reg_idx && !regidx_overlap(reg_idx, bcf_seqname(hdr,rec),rec->pos,rec->pos+rec->rlen-1, NULL) ) continue; bcf_write(out,hdr,rec); } tbx_itr_destroy(itr); } bcf_destroy(rec); } if ( hts_close(out) ) error("hts_close returned non-zero status for stdout\n"); bcf_hdr_destroy(hdr); hts_idx_destroy(idx); } else if ( format==vcf || format==sam || format==unknown_format ) { tbx_t *tbx = tbx_index_load(fname); if ( !tbx ) error("Could not load .tbi/.csi index of %s\n", fname); kstring_t str = {0,0,0}; if ( args->print_header ) { while ( hts_getline(fp, KS_SEP_LINE, &str) >= 0 ) { if ( !str.l || str.s[0]!=tbx->conf.meta_char ) break; puts(str.s); } } if ( !args->header_only ) { int nseq; const char **seq = NULL; if ( reg_idx ) seq = tbx_seqnames(tbx, &nseq); for (i=0; i<nregs; i++) { hts_itr_t *itr = tbx_itr_querys(tbx, regs[i]); if ( !itr ) continue; while (tbx_itr_next(fp, tbx, itr, &str) >= 0) { if ( reg_idx && !regidx_overlap(reg_idx,seq[itr->curr_tid],itr->curr_beg,itr->curr_end, NULL) ) continue; puts(str.s); } tbx_itr_destroy(itr); } free(seq); } free(str.s); tbx_destroy(tbx); } else if ( format==bam ) error("Please use \"samtools view\" for querying BAM files.\n"); if ( reg_idx ) regidx_destroy(reg_idx); if ( hts_close(fp) ) error("hts_close returned non-zero status: %s\n", fname); for (i=0; i<nregs; i++) free(regs[i]); free(regs); return 0; }
int main(int argc, char **argv) { if (argc < 3) { fprintf(stderr,"%s <bcf file> <num vars>\n", argv[0]); return 1; } char *fname = argv[1]; uint32_t num_vars = atoi(argv[2]); htsFile *fp = hts_open(fname,"rb"); bcf_hdr_t *hdr = bcf_hdr_read(fp); bcf1_t *line = bcf_init1(); int32_t *gt_p = NULL; uint32_t num_inds = bcf_hdr_nsamples(hdr); int32_t i, j, k, ntmp = 0, int_i = 0, two_bit_i = 0, sum, t_sum = 0; uint32_t num_ind_ints = 1 + ((num_inds - 1) / 16); pri_queue q = priq_new(0); priority p; uint32_t *packed_ints = (uint32_t *) calloc(num_ind_ints, sizeof(uint32_t)); FILE *gt_of = fopen("gt.tmp.packed","wb"); FILE *md_of = fopen("md.tmp.packed","w"); uint32_t *md_index = (uint32_t *) malloc(num_vars * sizeof(uint32_t)); uint32_t md_i = 0; unsigned long t_bcf_read = 0, t_bcf_dup = 0, t_bcf_unpack = 0, t_bcf_get_genotypes = 0, t_bcf_hdr_nsamples = 0, t_q = 0, t_write = 0, t_get_md = 0, t_md_write = 0, t_pack = 0; for (i = 0; i < num_vars; ++i) { sum = 0; int_i = 0; two_bit_i = 0; int r = bcf_read(fp, hdr, line); // Copy bcf1_t *t_line = bcf_dup(line); // Unpack bcf_unpack(t_line, BCF_UN_ALL); // Get metadata size_t len = strlen(bcf_hdr_id2name(hdr, t_line->rid)) + 10 + // max length of pos strlen(t_line->d.id) + strlen(t_line->d.allele[0]) + strlen(t_line->d.allele[1]) + 4; //tabs char *md = (char *) malloc(len * sizeof(char)); sprintf(md, "%s\t%d\t%s\t%s\t%s", bcf_hdr_id2name(hdr, t_line->rid), t_line->pos + 1, t_line->d.id, t_line->d.allele[0], t_line->d.allele[1]); // Write metadata md_i += strlen(md); md_index[i] = md_i; fprintf(md_of, "%s", md); // Get gentotypes uint32_t num_gts_per_sample = bcf_get_genotypes(hdr, t_line, >_p, &ntmp); num_gts_per_sample /= num_inds; int32_t *gt_i = gt_p; // Pack genotypes for (j = 0; j < num_inds; ++j) { uint32_t gt = 0; for (k = 0; k < num_gts_per_sample; ++k) { gt += bcf_gt_allele(gt_i[k]); } packed_ints[int_i] += gt << (30 - 2*two_bit_i); two_bit_i += 1; if (two_bit_i == 16) { two_bit_i = 0; int_i += 1; } sum += gt; gt_i += num_gts_per_sample; } // Get a priority for the variant based on the sum and number of // leading zeros p.sum = sum; uint32_t prefix_len = 0; j = 0; while ((j < num_ind_ints) && (packed_ints[j] == 0)){ prefix_len += 32; j += 1; } if (j < num_ind_ints) prefix_len += nlz1(packed_ints[j]); // Push it into the q p.len = prefix_len; int *j = (int *) malloc (sizeof(int)); j[0] = i; priq_push(q, j, p); // Write to file fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,gt_of); memset(packed_ints, 0, num_ind_ints*sizeof(uint32_t)); t_sum += sum; bcf_destroy(t_line); free(md); } fclose(gt_of); fclose(md_of); md_of = fopen("md.tmp.packed","r"); FILE *md_out = fopen("md.bim","w"); gt_of = fopen("gt.tmp.packed","rb"); FILE *s_gt_of = fopen("s.gt.tmp.packed","wb"); // Get variants in order and rewrite a variant-major sorted matrix while ( priq_top(q, &p) != NULL ) { int *d = priq_pop(q, &p); uint32_t start = 0; if (*d != 0) start = md_index[*d - 1]; uint32_t len = md_index[*d] - start; fseek(md_of, start*sizeof(char), SEEK_SET); char buf[len+1]; fread(buf, sizeof(char), len, md_of); buf[len] = '\0'; fseek(gt_of, (*d)*num_ind_ints*sizeof(uint32_t), SEEK_SET); fread(packed_ints, sizeof(uint32_t), num_ind_ints, gt_of); fwrite(packed_ints, sizeof(uint32_t), num_ind_ints,s_gt_of); fprintf(md_out, "%s\n", buf); } fclose(md_out); fclose(md_of); fclose(gt_of); fclose(s_gt_of); /* * In a packed-int variant-major matrix there will be a num_vars * number of rows, and a num_inds number of values packed into * num_inds_ints number of intergers. For examples, 16 rows of 16 values * will be 16 ints, where each int encodes 16 values. * */ uint32_t num_var_ints = 1 + ((num_vars - 1) / 16); uint32_t *I_data = (uint32_t *)calloc(num_var_ints*16,sizeof(uint32_t)); uint32_t **I = (uint32_t **)malloc(16*sizeof(uint32_t*)); for (i = 0; i < 16; ++i) I[i] = I_data + i*num_var_ints; uint32_t I_i = 0, I_int_i = 0; uint32_t v; s_gt_of = fopen("s.gt.tmp.packed","rb"); FILE *rs_gt_of = fopen("r.s.gt.tmp.packed","wb"); // Write these to values to that this is a well-formed uncompressed // packed int binary file (ubin) file fwrite(&num_vars, sizeof(uint32_t), 1, rs_gt_of); fwrite(&num_inds, sizeof(uint32_t), 1, rs_gt_of); /* * we need to loop over the columns in the v-major file. * There are num_vars rows, and num_ind_ints 16-ind packed columns * * In this loop : * i: cols in var-major form, rows in ind-major form * j: rows in var-major form, cols in ind-major form */ uint32_t num_inds_to_write = num_inds; for (i = 0; i < num_ind_ints; ++i) { // loop over each int col for (j = 0; j < num_vars; ++j) { // loop over head row in that col // skip to the value at the row/col fseek(s_gt_of, j*num_ind_ints*sizeof(uint32_t) + //row i*sizeof(uint32_t), //col SEEK_SET); fread(&v, sizeof(uint32_t), 1, s_gt_of); // one int corresponds to a col of 16 two-bit values // two_bit_i will move across the cols for (two_bit_i = 0; two_bit_i < 16; ++two_bit_i) { I[two_bit_i][I_i] += ((v >> (30 - 2*two_bit_i)) & 3) << (30 - 2*I_int_i); } I_int_i += 1; if (I_int_i == 16) { I_i += 1; I_int_i = 0; } } // When we are at the end of the file, and the number of lines // is not a factor of 16, only write out the lines that contain values if (num_inds_to_write >= 16) { fwrite(I_data, sizeof(uint32_t), num_var_ints*16, rs_gt_of); num_inds_to_write -= 16; } else { fwrite(I_data, sizeof(uint32_t), num_var_ints*num_inds_to_write, rs_gt_of); } memset(I_data, 0, num_var_ints*16*sizeof(uint32_t)); I_int_i = 0; I_i = 0; } fclose(s_gt_of); fclose(rs_gt_of); free(md_index); free(packed_ints); }
static void init_data(args_t *args) { bcf1_t *line = NULL; // With phased concat, the chunks overlap and come in the right order. To // avoid opening all files at once, store start positions to recognise need // for the next one. This way we can keep only two open chunks at once. if ( args->phased_concat ) { args->start_pos = (int*) malloc(sizeof(int)*args->nfnames); line = bcf_init(); } kstring_t str = {0,0,0}; int i, prev_chrid = -1; for (i=0; i<args->nfnames; i++) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); args->out_hdr = bcf_hdr_merge(args->out_hdr,hdr); if ( bcf_hdr_nsamples(hdr) != bcf_hdr_nsamples(args->out_hdr) ) error("Different number of samples in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]); int j; for (j=0; j<bcf_hdr_nsamples(hdr); j++) if ( strcmp(args->out_hdr->samples[j],hdr->samples[j]) ) error("Different sample names in %s. Perhaps \"bcftools merge\" is what you are looking for?\n", args->fnames[i]); if ( args->phased_concat ) { int ret = bcf_read(fp, hdr, line); if ( ret!=0 ) args->start_pos[i] = -2; // empty file else { int chrid = bcf_hdr_id2int(args->out_hdr,BCF_DT_CTG,bcf_seqname(hdr,line)); args->start_pos[i] = chrid==prev_chrid ? line->pos : -1; prev_chrid = chrid; } } bcf_hdr_destroy(hdr); hts_close(fp); } free(str.s); if ( line ) bcf_destroy(line); args->seen_seq = (int*) calloc(args->out_hdr->n[BCF_DT_CTG],sizeof(int)); if ( args->phased_concat ) { bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PQ,Number=1,Type=Integer,Description=\"Phasing Quality (bigger is better)\">"); bcf_hdr_append(args->out_hdr,"##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set\">"); } if (args->record_cmd_line) bcf_hdr_append_version(args->out_hdr, args->argc, args->argv, "bcftools_concat"); args->out_fh = hts_open(args->output_fname,hts_bcf_wmode(args->output_type)); if ( args->out_fh == NULL ) error("Can't write to \"%s\": %s\n", args->output_fname, strerror(errno)); if ( args->n_threads ) hts_set_threads(args->out_fh, args->n_threads); bcf_hdr_write(args->out_fh, args->out_hdr); if ( args->allow_overlaps ) { args->files = bcf_sr_init(); args->files->require_index = 1; if ( args->regions_list ) { if ( bcf_sr_set_regions(args->files, args->regions_list, args->regions_is_file)<0 ) error("Failed to read the regions: %s\n", args->regions_list); } if ( args->remove_dups ) { if ( !strcmp(args->remove_dups,"snps") ) args->files->collapse |= COLLAPSE_SNPS; else if ( !strcmp(args->remove_dups,"indels") ) args->files->collapse |= COLLAPSE_INDELS; else if ( !strcmp(args->remove_dups,"both") ) args->files->collapse |= COLLAPSE_SNPS | COLLAPSE_INDELS; else if ( !strcmp(args->remove_dups,"any") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"all") ) args->files->collapse |= COLLAPSE_ANY; else if ( !strcmp(args->remove_dups,"none") ) args->files->collapse = COLLAPSE_NONE; else error("The -D string \"%s\" not recognised.\n", args->remove_dups); } for (i=0; i<args->nfnames; i++) if ( !bcf_sr_add_reader(args->files,args->fnames[i]) ) error("Failed to open %s: %s\n", args->fnames[i],bcf_sr_strerror(args->files->errnum)); } else if ( args->phased_concat ) { // Remove empty files from the list int nok = 0; while (1) { while ( nok<args->nfnames && args->start_pos[nok]!=-2 ) nok++; if ( nok==args->nfnames ) break; i = nok; while ( i<args->nfnames && args->start_pos[i]==-2 ) i++; if ( i==args->nfnames ) break; int tmp = args->start_pos[nok]; args->start_pos[nok] = args->start_pos[i]; args->start_pos[i] = tmp; char *str = args->fnames[nok]; args->fnames[nok] = args->fnames[i]; args->fnames[i] = str; } for (i=nok; i<args->nfnames; i++) free(args->fnames[i]); args->nfnames = nok; for (i=1; i<args->nfnames; i++) if ( args->start_pos[i-1]!=-1 && args->start_pos[i]!=-1 && args->start_pos[i]<args->start_pos[i-1] ) error("The files not in ascending order: %d in %s, %d in %s\n", args->start_pos[i-1]+1,args->fnames[i-1],args->start_pos[i]+1,args->fnames[i]); args->prev_chr = -1; args->swap_phase = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); args->nmatch = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); args->nmism = (int*) calloc(bcf_hdr_nsamples(args->out_hdr),sizeof(int)); args->phase_qual = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); args->phase_set = (int32_t*) malloc(bcf_hdr_nsamples(args->out_hdr)*sizeof(int32_t)); args->files = bcf_sr_init(); args->files->require_index = 1; args->ifname = 0; } }
/** * Flush records. */ void BCFSingleGenotypingBufferedReader::flush(bam_hdr_t *h, bam1_t *s, bool flush_all) { if (flush_all) { //read all the remaining from the reference genotyping file bcf1_t *v = bcf_init(); while (odr->read(v)) { int32_t vtype = vm->classify_variant(odr->hdr, v, variant); GenotypingRecord* g = create_genotyping_record(odr->hdr, v, 2, variant); buffer.push_back(g); v = bcf_init(); } bcf_destroy(v); GenotypingRecord* g; while (!buffer.empty()) { g = buffer.front(); // genotype_and_print(g); delete g; buffer.pop_front(); } } else { //std::cerr << "partial flush\n"; uint32_t tid = bam_get_tid(s); GenotypingRecord* g; while (!buffer.empty()) { g = buffer.front(); if (tid==g->rid) { if (bam_get_pos1(s) > g->end1) { // genotype_and_print(g); delete g; buffer.pop_front(); } else { return; } } else if (tid>g->rid) { // genotype_and_print(g); delete g; buffer.pop_front(); } else { return; } } } }
static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; void *rghash = 0; bcf_callaux_t *bca = 0; bcf_callret1_t *bcr = 0; bcf_call_t bc; bcf_t *bp = 0; bcf_hdr_t *bh = 0; bam_sample_t *sm = 0; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); sm = bam_smpl_init(); // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } if (i == 0) h = h_tmp; else { // FIXME: to check consistency bam_header_destroy(h_tmp); } } gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; bh = calloc(1, sizeof(bcf_hdr_t)); s.l = s.m = 0; s.s = 0; bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w"); for (i = 0; i < h->n_targets; ++i) { kputs(h->target_name[i], &s); kputc('\0', &s); } bh->l_nm = s.l; bh->name = malloc(s.l); memcpy(bh->name, s.s, s.l); s.l = 0; for (i = 0; i < sm->n; ++i) { kputs(sm->smpl[i], &s); kputc('\0', &s); } bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); bh->txt = malloc(strlen(BAM_VERSION) + 64); bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); int storeSize = 100; int delStore[2][100] = {{0},{0}}; typedef char * mstring; while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bcf_call_combine(gplp.n, bcr, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } } } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { //MDW start //for each position in the pileup column int charLen = 16; int countChars[ charLen ][2]; int countiChars[ charLen ][2]; int countGap[2]={0,0}; //double qvTotal=0; int numStruck=0; int numGood=0; int tti; int ttj; mstring insAllele[100]; int insAlleleCnt[100]; int sf=0; int flag=0; //typedef char * string; char insStr0[10000]; int iCnt0=0; char insStr1[10000]; int iCnt1=0; char delStr0[10000]; int dCnt0=0; char delStr1[10000]; int dCnt1=0; float qposP[10000]; int qposCnt=0; //initialize with zeros for(tti=0;tti<charLen;tti++){ countChars[tti][0]=0; countChars[tti][1]=0; } // define repeat length here; look back up to 10 prior positions // start one position away. int replC=0; // for(tti=1;tti<=15;tti++){ // check for greater than zero if(toupper(ref[pos-1])==toupper(ref[pos-tti])){ replC++; }else{ // breaks the chain at first non identical to current position not strict homopolymer break; } } int reprC=0; // for(tti=1;tti<=15;tti++){ // check for greater than zero if(toupper(ref[pos+1])==toupper(ref[pos+tti])){ reprC++; }else{ // breaks the chain at first non identical to current position not strict homopolymer break; } } int repT = replC; if(replC < reprC){ repT=reprC; } for (j = 0; j < n_plp[i]; ++j){ const bam_pileup1_t *p = plp[i] + j; /* SAME LOGIC AS pileup_seq() */ if(p->is_refskip){ // never count intron gaps in numStruck continue; } if(p->is_del){ // skip deletion gap, after first position which is the first aligned char continue; } if( p->b->core.qual < conf->min_mqToCount || // mapping quality conf->maxrepC < (repT) || // max homopolymer run, this will not (!p->is_del && bam1_qual(p->b)[p->qpos] < conf->min_baseQ) || // base quality for matches p->alignedQPosBeg <= (conf->trimEnd ) || p->alignedQPosEnd <= (conf->trimEnd ) || // trimEnd is 1-based p->zf == 1 || // fusion tag p->ih > conf->maxIH || // max hit index (p->nmd > conf->maxNM) || // max mismatch (conf->flagFilter == 1 && !(p->b->core.flag&BAM_FPROPER_PAIR)) || // optionally keep only proper pairs (conf->flagFilter == 2 && p->b->core.flag&BAM_FSECONDARY) || // optionally strike secondary (conf->flagFilter == 3 && p->b->core.flag&BAM_FDUP) || // optionally strike dup (conf->flagFilter == 4 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY)) || // optionally strike secondary or dup (conf->flagFilter == 5 && (p->b->core.flag&BAM_FDUP || p->b->core.flag&BAM_FSECONDARY || p->b->core.flag&BAM_FQCFAIL || !(p->b->core.flag&BAM_FPROPER_PAIR) )) // optionally strike secondary, dup and QCfail ){ numStruck++; continue; } //printf("repT=%d: %d %c %c %c %c \n",repT,p->indel,ref[pos],ref[pos-1],ref[pos-2],ref[pos-3]); if(!p->is_del && p->indel==0){ countChars[ bam1_seqi(bam1_seq(p->b), p->qpos) ][ bam1_strand(p->b) ] ++; numGood++; }else if(p->is_refskip){ countGap[ bam1_strand(p->b) ]++; } if(p->indel<0){ numGood++; if(bam1_strand(p->b) ==0){ for(tti=1;tti<= -p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position delStr0[dCnt0] = ref[pos+tti]; dCnt0++; } delStr0[dCnt0] = ','; dCnt0++; }else{ for(tti=1;tti<= -p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position delStr1[dCnt1] = ref[pos+tti]; dCnt1++; } delStr1[dCnt1] = ','; dCnt1++; } }else if(p->indel>0){ numGood++; if(bam1_strand(p->b) ==0){ for(tti=1;tti<= p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position insStr0[iCnt0] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]; iCnt0++; } insStr0[iCnt0] = ','; iCnt0++; }else{ for(tti=1;tti<= p->indel; tti++) { // current spot, starting at 0 in store, because indel<0 refers to next position insStr1[iCnt1] = bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos + tti)]; iCnt1++; } insStr1[iCnt1] = ','; iCnt1++; } } //calculate position of variant within aligned read - no soft clips if( toupper(ref[pos]) != toupper(bam_nt16_rev_table[bam1_seqi(bam1_seq(p->b), p->qpos)]) || p->indel>0 || p->indel<0 ){ //distance to end; calculate distance to end of aligned read. removes soft clips. int distToEnd = (p->alignedQPosBeg < p->alignedQPosEnd) ? p->alignedQPosBeg : p->alignedQPosEnd; qposP[qposCnt] = distToEnd; qposCnt++; // printf("id=%s, pos=%d",bam1_qname(p->b),distToEnd); } } // //print A,C,G,T, by +/- printf("\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d", countChars[1][0],countChars[1][1], countChars[2][0],countChars[2][1], countChars[4][0],countChars[4][1], countChars[8][0],countChars[8][1], countChars[7][0],countChars[7][1]); putchar('\t'); for(tti=0;tti<dCnt0;tti++){ putchar(delStr0[tti]); } putchar('\t'); for(tti=0;tti<dCnt1;tti++){ putchar(delStr1[tti]); } putchar('\t'); for(tti=0;tti<iCnt0;tti++){ putchar(insStr0[tti]); } putchar('\t'); for(tti=0;tti<iCnt1;tti++){ putchar(insStr1[tti]); } printf("\t%d\t%d",numGood,numStruck); // get non-ref qpos variation float medqpos = -1; float medAbsDev = -1; if(qposCnt>0){ medqpos = median(qposCnt,qposP); float absDev[qposCnt]; for(tti=0;tti<qposCnt;tti++){ absDev[tti] = abs(medqpos - qposP[tti]); } medAbsDev = median(qposCnt-1,absDev); } printf("\t%f",medAbsDev); ///END MDW } } putchar('\n'); } } bcf_close(bp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return 0; }
int main(int argc, char **argv) { if ( argc == 1 ) error("Usage : bed_annos -c config.json -O z -o output.vcf.gz input.vcf.gz"); int i; for ( i = 1; i < argc; ) { const char *a = argv[i++]; const char **var = 0; if ( strcmp(a, "-c") == 0 ) var = &json_fname; else if ( strcmp(a, "-O") == 0 ) var = &output_fname_type; else if ( strcmp(a, "-o") == 0 ) var = &output_fname; if ( var != 0 ) { if ( i == argc ) error("Missing an argument after %s", a); *var = argv[i++]; continue; } if ( input_fname == 0 ) { input_fname = a; continue; } error("Unknown argument : %s.", a); } struct vcfanno_config *con = vcfanno_config_init(); if ( vcfanno_load_config(con, json_fname) != 0 ) error("Failed to load configure file. %s : %s", json_fname, strerror(errno)); vcfanno_config_debug(con); if ( con->beds.n_beds == 0) error("No bed database specified."); if ( input_fname == 0 && (!isatty(fileno(stdin))) ) input_fname = "-"; if ( input_fname == 0 ) error("No input file."); int out_type = FT_VCF; if ( output_fname_type != 0 ) { switch (output_fname_type[0]) { case 'b': out_type = FT_BCF_GZ; break; case 'u': out_type = FT_BCF; break; case 'z': out_type = FT_VCF_GZ; break; case 'v': out_type = FT_VCF; break; default : error("The output type \"%d\" not recognised\n", out_type); }; } htsFile *fp = hts_open(input_fname, "r"); if ( fp == NULL ) error("Failed to open %s : %s.", input_fname, strerror(errno)); htsFormat type = *hts_get_format(fp); if ( type.format != vcf && type.format != bcf ) error("Unsupported input format. %s", input_fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( hdr == NULL ) error("Failed to parse header."); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); htsFile *fout = output_fname == 0 ? hts_open("-", hts_bcf_wmode(out_type)) : hts_open(output_fname, hts_bcf_wmode(out_type)); struct beds_options opts = { .beds_is_inited = 0,}; beds_options_init(&opts); opts.hdr_out = hdr_out; for ( i = 0; i < con->beds.n_beds; ++i ) { beds_database_add(&opts, con->beds.files[i].fname, con->beds.files[i].columns); } bcf_hdr_write(fout, hdr_out); bcf1_t *line = bcf_init(); while ( bcf_read(fp, hdr, line) == 0 ) { anno_beds_core(&opts, line); bcf_write(fout, hdr_out, line); } bcf_destroy(line); bcf_hdr_destroy(hdr); bcf_hdr_destroy(hdr_out); beds_options_destroy(&opts); hts_close(fp); hts_close(fout); return 0; }
/** * Destructor. */ GenotypingRecord::~GenotypingRecord() { if (v) bcf_destroy(v); }
static void concat(args_t *args) { int i; if ( args->phased_concat ) // phased concat { // keep only two open files at a time while ( args->ifname < args->nfnames ) { int new_file = 0; while ( args->files->nreaders < 2 && args->ifname < args->nfnames ) { if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum)); new_file = 1; args->ifname++; if ( args->start_pos[args->ifname-1]==-1 ) break; // new chromosome, start with only one file open if ( args->ifname < args->nfnames && args->start_pos[args->ifname]==-1 ) break; // next file starts on a different chromosome } // is there a line from the previous run? Seek the newly opened reader to that position int seek_pos = -1; int seek_chr = -1; if ( bcf_sr_has_line(args->files,0) ) { bcf1_t *line = bcf_sr_get_line(args->files,0); bcf_sr_seek(args->files, bcf_seqname(args->files->readers[0].header,line), line->pos); seek_pos = line->pos; seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[0].header,line)); } else if ( new_file ) bcf_sr_seek(args->files,NULL,0); // set to start int nret; while ( (nret = bcf_sr_next_line(args->files)) ) { if ( !bcf_sr_has_line(args->files,0) ) // no input from the first reader { // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped if ( ! bcf_sr_region_done(args->files,0) ) continue; phased_flush(args); bcf_sr_remove_reader(args->files, 0); } // Get a line to learn about current position for (i=0; i<args->files->nreaders; i++) if ( bcf_sr_has_line(args->files,i) ) break; bcf1_t *line = bcf_sr_get_line(args->files,i); // This can happen after bcf_sr_seek: indel may start before the coordinate which we seek to. if ( seek_chr>=0 && seek_pos>line->pos && seek_chr==bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)) ) continue; seek_pos = seek_chr = -1; // Check if the position overlaps with the next, yet unopened, reader int must_seek = 0; while ( args->ifname < args->nfnames && args->start_pos[args->ifname]!=-1 && line->pos >= args->start_pos[args->ifname] ) { must_seek = 1; if ( !bcf_sr_add_reader(args->files,args->fnames[args->ifname]) ) error("Failed to open %s: %s\n", args->fnames[args->ifname],bcf_sr_strerror(args->files->errnum)); args->ifname++; } if ( must_seek ) { bcf_sr_seek(args->files, bcf_seqname(args->files->readers[i].header,line), line->pos); seek_pos = line->pos; seek_chr = bcf_hdr_name2id(args->out_hdr, bcf_seqname(args->files->readers[i].header,line)); continue; } // We are assuming that there is a perfect overlap, sites which are not present in both files are dropped if ( args->files->nreaders>1 && !bcf_sr_has_line(args->files,1) && !bcf_sr_region_done(args->files,1) ) continue; phased_push(args, bcf_sr_get_line(args->files,0), args->files->nreaders>1 ? bcf_sr_get_line(args->files,1) : NULL); } if ( args->files->nreaders ) { phased_flush(args); while ( args->files->nreaders ) bcf_sr_remove_reader(args->files, 0); } } } else if ( args->files ) // combining overlapping files, using synced reader { while ( bcf_sr_next_line(args->files) ) { for (i=0; i<args->files->nreaders; i++) { bcf1_t *line = bcf_sr_get_line(args->files,i); if ( !line ) continue; bcf_translate(args->out_hdr, args->files->readers[i].header, line); bcf_write1(args->out_fh, args->out_hdr, line); if ( args->remove_dups ) break; } } } else // concatenating { kstring_t tmp = {0,0,0}; int prev_chr_id = -1, prev_pos; bcf1_t *line = bcf_init(); for (i=0; i<args->nfnames; i++) { htsFile *fp = hts_open(args->fnames[i], "r"); if ( !fp ) error("Failed to open: %s\n", args->fnames[i]); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to parse header: %s\n", args->fnames[i]); if ( !fp->is_bin && args->output_type&FT_VCF ) { line->max_unpack = BCF_UN_STR; // if VCF is on both input and output, avoid VCF to BCF conversion while ( hts_getline(fp, KS_SEP_LINE, &fp->line) >=0 ) { char *str = fp->line.s; while ( *str && *str!='\t' ) str++; tmp.l = 0; kputsn(fp->line.s,str-fp->line.s,&tmp); int chr_id = bcf_hdr_name2id(args->out_hdr, tmp.s); if ( chr_id<0 ) error("The sequence \"%s\" not defined in the header: %s\n(Quick workaround: index the file.)\n", tmp.s, args->fnames[i]); if ( prev_chr_id!=chr_id ) { prev_pos = -1; if ( args->seen_seq[chr_id] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", tmp.s); } char *end; int pos = strtol(str+1,&end,10) - 1; if ( end==str+1 ) error("Could not parse line: %s\n", fp->line.s); if ( prev_pos > pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", tmp.s); args->seen_seq[chr_id] = 1; prev_chr_id = chr_id; if ( vcf_write_line(args->out_fh, &fp->line)!=0 ) error("Failed to write %d bytes\n", fp->line.l); } } else { // BCF conversion is required line->max_unpack = 0; while ( bcf_read(fp, hdr, line)==0 ) { bcf_translate(args->out_hdr, hdr, line); if ( prev_chr_id!=line->rid ) { prev_pos = -1; if ( args->seen_seq[line->rid] ) error("\nThe chromosome block %s is not contiguous, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); } if ( prev_pos > line->pos ) error("The chromosome block %s is not sorted, consider running with -a.\n", bcf_seqname(args->out_hdr, line)); args->seen_seq[line->rid] = 1; prev_chr_id = line->rid; if ( bcf_write(args->out_fh, args->out_hdr, line)!=0 ) error("Failed to write\n"); } } bcf_hdr_destroy(hdr); hts_close(fp); } bcf_destroy(line); free(tmp.s); } }
/** * Collects sufficient statistics from read for variants to be genotyped. * * The VCF records in the buffer must never occur before */ void BCFSingleGenotypingBufferedReader::process_read(bam_hdr_t *h, bam1_t *s) { //wrap bam1_t in AugmentBAMRecord as.initialize(h, s); uint32_t tid = bam_get_tid(s); uint32_t beg1 = as.beg1; uint32_t end1 = as.end1; //collect statistics for variant records that are in the buffer and overlap with the read GenotypingRecord* g; for (std::list<GenotypingRecord*>::iterator i=buffer.begin(); i!=buffer.end(); ++i) { g = *i; // std::cerr << g->pos1 << " " << g->beg1 << " " << g->end1 << " "; //same chromosome if (tid==g->rid) { if (end1 < g->beg1) { //can terminate return; } else if (beg1 > g->end1) { //this should not occur if the buffer was flushed before invoking process read continue; } //else if (beg1 <= g->beg1 && g->end1 <= end1) else if (beg1 <= g->pos1 && g->pos1 <= end1) { // collect_sufficient_statistics(*i, as); } else { //other types of overlap, just ignore } // std::cerr << "\n"; } //prior chromosome else if (tid<g->rid) { //this should not occur if the buffer was flushed before invoking process read return; } //latter chromosome else if (tid>g->rid) { //in case if the buffer has a VCF record later in the list which matches it continue; } } //you will only reach here if a read occurs after or overlaps the last record in the buffer //adding new VCF records and collecting statistics if necessary bcf1_t *v = bcf_init(); while (odr->read(v)) { int32_t vtype = vm->classify_variant(odr->hdr, v, variant); g = create_genotyping_record(odr->hdr, v, 2, variant); buffer.push_back(g); if (tid==g->rid) { //if (end1>=g->beg1 && pos1<=g->end1) if (beg1 <= g->pos1 && g->pos1 <= end1) { // collect_sufficient_statistics(g, as); } } //VCF record occurs after the read if (tid < g->rid || end1 < g->beg1) { return; } else { v = bcf_init(); } } //this means end of file bcf_destroy(v); }
static int mpileup(mplp_conf_t *conf, int n, char **fn) { extern void *bcf_call_add_rg(void *rghash, const char *hdtext, const char *list); extern void bcf_call_del_rghash(void *rghash); mplp_aux_t **data; int i, tid, pos, *n_plp, tid0 = -1, beg0 = 0, end0 = 1u<<29, ref_len, ref_tid = -1, max_depth, max_indel_depth; const bam_pileup1_t **plp; bam_mplp_t iter; bam_header_t *h = 0; char *ref; void *rghash = 0; bcf_callaux_t *bca = 0; bcf_callret1_t *bcr = 0; bcf_call_t bc; bcf_t *bp = 0; bcf_hdr_t *bh = 0; bam_sample_t *sm = 0; kstring_t buf; mplp_pileup_t gplp; memset(&gplp, 0, sizeof(mplp_pileup_t)); memset(&buf, 0, sizeof(kstring_t)); memset(&bc, 0, sizeof(bcf_call_t)); data = calloc(n, sizeof(void*)); plp = calloc(n, sizeof(void*)); n_plp = calloc(n, sizeof(int*)); sm = bam_smpl_init(); // read the header and initialize data for (i = 0; i < n; ++i) { bam_header_t *h_tmp; data[i] = calloc(1, sizeof(mplp_aux_t)); data[i]->fp = strcmp(fn[i], "-") == 0? bam_dopen(fileno(stdin), "r") : bam_open(fn[i], "r"); data[i]->conf = conf; h_tmp = bam_header_read(data[i]->fp); data[i]->h = i? h : h_tmp; // for i==0, "h" has not been set yet bam_smpl_add(sm, fn[i], (conf->flag&MPLP_IGNORE_RG)? 0 : h_tmp->text); rghash = bcf_call_add_rg(rghash, h_tmp->text, conf->pl_list); if (conf->reg) { int beg, end; bam_index_t *idx; idx = bam_index_load(fn[i]); if (idx == 0) { fprintf(stderr, "[%s] fail to load index for %d-th input.\n", __func__, i+1); exit(1); } if (bam_parse_region(h_tmp, conf->reg, &tid, &beg, &end) < 0) { fprintf(stderr, "[%s] malformatted region or wrong seqname for %d-th input.\n", __func__, i+1); exit(1); } if (i == 0) tid0 = tid, beg0 = beg, end0 = end; data[i]->iter = bam_iter_query(idx, tid, beg, end); bam_index_destroy(idx); } if (i == 0) h = h_tmp; else { // FIXME: to check consistency bam_header_destroy(h_tmp); } } gplp.n = sm->n; gplp.n_plp = calloc(sm->n, sizeof(int)); gplp.m_plp = calloc(sm->n, sizeof(int)); gplp.plp = calloc(sm->n, sizeof(void*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, sm->n, n); // write the VCF header if (conf->flag & MPLP_GLF) { kstring_t s; bh = calloc(1, sizeof(bcf_hdr_t)); s.l = s.m = 0; s.s = 0; bp = bcf_open("-", (conf->flag&MPLP_NO_COMP)? "wu" : "w"); for (i = 0; i < h->n_targets; ++i) { kputs(h->target_name[i], &s); kputc('\0', &s); } bh->l_nm = s.l; bh->name = malloc(s.l); memcpy(bh->name, s.s, s.l); s.l = 0; for (i = 0; i < sm->n; ++i) { kputs(sm->smpl[i], &s); kputc('\0', &s); } bh->l_smpl = s.l; bh->sname = malloc(s.l); memcpy(bh->sname, s.s, s.l); bh->txt = malloc(strlen(BAM_VERSION) + 64); bh->l_txt = 1 + sprintf(bh->txt, "##samtoolsVersion=%s\n", BAM_VERSION); free(s.s); bcf_hdr_sync(bh); bcf_hdr_write(bp, bh); bca = bcf_call_init(-1., conf->min_baseQ); bcr = calloc(sm->n, sizeof(bcf_callret1_t)); bca->rghash = rghash; bca->openQ = conf->openQ, bca->extQ = conf->extQ, bca->tandemQ = conf->tandemQ; bca->min_frac = conf->min_frac; bca->min_support = conf->min_support; } if (tid0 >= 0 && conf->fai) { // region is set ref = faidx_fetch_seq(conf->fai, h->target_name[tid0], 0, 0x7fffffff, &ref_len); ref_tid = tid0; for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid0; } else ref_tid = -1, ref = 0; iter = bam_mplp_init(n, mplp_func, (void**)data); max_depth = conf->max_depth; if (max_depth * sm->n > 1<<20) fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__); if (max_depth * sm->n < 8000) { max_depth = 8000 / sm->n; fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth); } max_indel_depth = conf->max_indel_depth * sm->n; bam_mplp_set_maxcnt(iter, max_depth); while (bam_mplp_auto(iter, &tid, &pos, n_plp, plp) > 0) { if (conf->reg && (pos < beg0 || pos >= end0)) continue; // out of the region requested if (conf->bed && tid >= 0 && !bed_overlap(conf->bed, h->target_name[tid], pos, pos+1)) continue; if (tid != ref_tid) { free(ref); ref = 0; if (conf->fai) ref = faidx_fetch_seq(conf->fai, h->target_name[tid], 0, 0x7fffffff, &ref_len); for (i = 0; i < n; ++i) data[i]->ref = ref, data[i]->ref_id = tid; ref_tid = tid; } if (conf->flag & MPLP_GLF) { int total_depth, _ref0, ref16; bcf1_t *b = calloc(1, sizeof(bcf1_t)); for (i = total_depth = 0; i < n; ++i) total_depth += n_plp[i]; group_smpl(&gplp, sm, &buf, n, fn, n_plp, plp, conf->flag & MPLP_IGNORE_RG); _ref0 = (ref && pos < ref_len)? ref[pos] : 'N'; ref16 = bam_nt16_table[_ref0]; for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], ref16, bca, bcr + i); bcf_call_combine(gplp.n, bcr, ref16, &bc); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), 0, 0); bcf_write(bp, bh, b); bcf_destroy(b); // call indels if (!(conf->flag&MPLP_NO_INDEL) && total_depth < max_indel_depth && bcf_call_gap_prep(gplp.n, gplp.n_plp, gplp.plp, pos, bca, ref, rghash) >= 0) { for (i = 0; i < gplp.n; ++i) bcf_call_glfgen(gplp.n_plp[i], gplp.plp[i], -1, bca, bcr + i); if (bcf_call_combine(gplp.n, bcr, -1, &bc) >= 0) { b = calloc(1, sizeof(bcf1_t)); bcf_call2bcf(tid, pos, &bc, b, (conf->flag&(MPLP_FMT_DP|MPLP_FMT_SP))? bcr : 0, (conf->flag&MPLP_FMT_SP), bca, ref); bcf_write(bp, bh, b); bcf_destroy(b); } } } else { printf("%s\t%d\t%c", h->target_name[tid], pos + 1, (ref && pos < ref_len)? ref[pos] : 'N'); for (i = 0; i < n; ++i) { int j; printf("\t%d\t", n_plp[i]); if (n_plp[i] == 0) { printf("*\t*"); // FIXME: printf() is very slow... if (conf->flag & MPLP_PRINT_POS) printf("\t*"); } else { for (j = 0; j < n_plp[i]; ++j) pileup_seq(plp[i] + j, pos, ref_len, ref); putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; int c = bam1_qual(p->b)[p->qpos] + 33; if (c > 126) c = 126; putchar(c); } if (conf->flag & MPLP_PRINT_MAPQ) { putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { int c = plp[i][j].b->core.qual + 33; if (c > 126) c = 126; putchar(c); } } if (conf->flag & MPLP_PRINT_POS) { putchar('\t'); for (j = 0; j < n_plp[i]; ++j) { if (j > 0) putchar(','); printf("%d", plp[i][j].qpos + 1); // FIXME: printf() is very slow... } } } } putchar('\n'); } } bcf_close(bp); bam_smpl_destroy(sm); free(buf.s); for (i = 0; i < gplp.n; ++i) free(gplp.plp[i]); free(gplp.plp); free(gplp.n_plp); free(gplp.m_plp); bcf_call_del_rghash(rghash); bcf_hdr_destroy(bh); bcf_call_destroy(bca); free(bc.PL); free(bcr); bam_mplp_destroy(iter); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(plp); free(ref); free(n_plp); return 0; }
static void reheader_bcf(args_t *args, int is_compressed) { htsFile *fp = hts_open(args->fname, "r"); if ( !fp ) error("Failed to open: %s\n", args->fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); kstring_t htxt = {0,0,0}; int hlen; htxt.s = bcf_hdr_fmt_text(hdr, 1, &hlen); htxt.l = hlen; int i, nsamples = 0; char **samples = NULL; if ( args->samples_fname ) samples = hts_readlines(args->samples_fname, &nsamples); if ( args->header_fname ) { free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; read_header_file(args->header_fname, &htxt); } if ( samples ) { set_samples(samples, nsamples, &htxt); for (i=0; i<nsamples; i++) free(samples[i]); free(samples); } bcf_hdr_t *hdr_out = bcf_hdr_init("r"); bcf_hdr_parse(hdr_out, htxt.s); if ( args->header_fname ) hdr_out = strip_header(hdr, hdr_out); // write the header and the body htsFile *fp_out = hts_open("-",is_compressed ? "wb" : "wbu"); bcf_hdr_write(fp_out, hdr_out); bcf1_t *rec = bcf_init(); while ( bcf_read(fp, hdr, rec)==0 ) { // sanity checking, this slows things down. Make it optional? bcf_unpack(rec, BCF_UN_ALL); if ( rec->rid >= hdr_out->n[BCF_DT_CTG] || strcmp(bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid),bcf_hdr_int2id(hdr_out,BCF_DT_CTG,rec->rid)) ) error("The CHROM is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid)); for (i=0; i<rec->d.n_flt; i++) { int id = rec->d.flt[i]; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FLT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FILTER ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->d.n_flt ) error("The FILTER is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i])); for (i=0; i<rec->n_info; i++) { int id = rec->d.info[i].key; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_INFO,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken INFO ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_info ) error("The INFO tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.info[i].key)); for (i=0; i<rec->n_fmt; i++) { int id = rec->d.fmt[i].id; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FMT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FORMAT ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_fmt ) error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); bcf_write(fp_out,hdr_out,rec); } bcf_destroy(rec); free(htxt.s); hts_close(fp_out); hts_close(fp); bcf_hdr_destroy(hdr_out); bcf_hdr_destroy(hdr); }