/**
 * Build a new header from the records of `dst`, carrying over the IDX
 * values of matching records in `src`.
 *
 * Steps, in order:
 *   1. Copy every `dst` record whose type is not FLT/INFO/FMT/CTG.
 *   2. For each FLT/INFO/FMT/CTG record of `src`, look up the record of the
 *      same type and ID in `dst`; if present, duplicate the `dst` record and,
 *      when `src` carries an "IDX" key, attach that index to the copy.
 *   3. Sync the new header, then add the FLT/INFO/FMT/CTG records of `dst`
 *      that were not matched in step 2.
 *   4. Copy the sample names of `dst`.
 *
 * Records without an "ID" key are skipped in steps 2 and 3, because they
 * cannot be matched by ID.
 *
 * @param src  header whose IDX values are transferred
 * @param dst  header supplying the record contents; it is destroyed by this
 *             function and must not be used afterwards
 * @return     the newly created header
 */
static bcf_hdr_t *strip_header(bcf_hdr_t *src, bcf_hdr_t *dst)
{
    bcf_hrec_t *src_hrec, *dst_hrec, *tmp;
    bcf_hdr_t *out = bcf_hdr_init("r");
    int i;

    for (i=0; i<dst->nhrec; i++)
    {
        // first insert lines which do not code BCF ids, their order does not matter
        dst_hrec = dst->hrec[i];
        if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type==BCF_HL_CTG ) continue;
        bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec));
    }

    for (i=0; i<src->nhrec; i++)
    {
        // now transfer header lines which define BCF ids
        src_hrec = src->hrec[i];
        if ( src_hrec->type==BCF_HL_FLT || src_hrec->type==BCF_HL_INFO || src_hrec->type==BCF_HL_FMT || src_hrec->type==BCF_HL_CTG )
        {
            int j = bcf_hrec_find_key(src_hrec, "ID");
            if ( j<0 ) continue;    // no ID key: vals[j] would be out of bounds

            dst_hrec = bcf_hdr_get_hrec(dst, src_hrec->type, "ID", src_hrec->vals[j], NULL);
            if ( !dst_hrec ) continue;

            tmp = bcf_hrec_dup(dst_hrec);

            j = bcf_hrec_find_key(src_hrec, "IDX");
            if ( j>=0 )
            {
                j = atoi(src_hrec->vals[j]);
                hrec_add_idx(tmp, j);
            }
            bcf_hdr_add_hrec(out, tmp);
        }
    }
    bcf_hdr_sync(out);

    for (i=0; i<dst->nhrec; i++)
    {
        // finally add new structured fields
        dst_hrec = dst->hrec[i];
        if ( dst_hrec->type==BCF_HL_FLT || dst_hrec->type==BCF_HL_INFO || dst_hrec->type==BCF_HL_FMT || dst_hrec->type==BCF_HL_CTG )
        {
            int j = bcf_hrec_find_key(dst_hrec, "ID");
            if ( j<0 ) continue;    // no ID key: nothing to look up by

            tmp = bcf_hdr_get_hrec(out, dst_hrec->type, "ID", dst_hrec->vals[j], NULL);
            if ( !tmp ) bcf_hdr_add_hrec(out, bcf_hrec_dup(dst_hrec));
        }
    }

    for (i=0; i<dst->n[BCF_DT_SAMPLE]; i++) bcf_hdr_add_sample(out, dst->samples[i]);
    // NOTE(review): the NULL call presumably finalizes the sample list - confirm against bcf_hdr_add_sample
    bcf_hdr_add_sample(out, NULL);

    bcf_hdr_destroy(dst);
    return out;
}
/**
 * Parse the text stored in hdr->text into header records.
 *
 * Header lines are consumed one at a time until bcf_hdr_parse_line() returns
 * NULL. A PASS filter record is then added, and the remaining text is handed
 * to bcf_hdr_parse_sample_line(). The header is synced only if the return
 * values of bcf_hdr_add_hrec() sum to a non-zero value.
 *
 * @param hdr  header whose `text` field holds the text to parse
 * @return     always 0
 */
int bcf_hdr_parse(bcf_hdr_t *hdr)
{
    char *cursor = hdr->text;
    bcf_hrec_t *rec;
    int consumed;
    int dirty = 0;

    for (;;)
    {
        rec = bcf_hdr_parse_line(hdr, cursor, &consumed);
        if ( rec == NULL ) break;

        dirty += bcf_hdr_add_hrec(hdr, rec);
        cursor += consumed;     // step over the line just parsed
    }

    // The PASS filter record is added after the lines found in the text
    rec = bcf_hdr_parse_line(hdr, "##FILTER=<ID=PASS,Description=\"All filters passed\">", &consumed);
    dirty += bcf_hdr_add_hrec(hdr, rec);

    // cursor now points at the first text that did not parse as a header line
    bcf_hdr_parse_sample_line(hdr, cursor);

    if ( dirty != 0 )
    {
        bcf_hdr_sync(hdr);
    }
    return 0;
}
/**
 * Merge the header lines and samples of `_hr` into `hw`.
 *
 * Header lines:
 *   - A BCF_HL_GEN record with a value is added unless `hw` already held
 *     (before the merge started) a BCF_HL_GEN record with the same key and
 *     the same value.
 *   - Any other record is looked up in `hw` by its type and its first value,
 *     and added only when the lookup finds nothing. Records with no
 *     key/value pairs cannot be looked up and are skipped.
 *
 * Samples: each sample of `_hr` is added to `hw`. If `hw` already has a
 * sample of that name, it is added as "<clash_prefix>:<name>" instead.
 *
 * @param hw            header that receives the merged content
 * @param _hr           header that is merged in
 * @param clash_prefix  prefix for clashing sample names; must not be NULL
 *                      when a clash occurs
 *
 * If the renamed sample cannot be allocated, an error is printed to stderr
 * and the remaining samples are not merged.
 */
void bcf_hdr_merge(bcf_hdr_t *hw, const bcf_hdr_t *_hr, const char *clash_prefix)
{
    bcf_hdr_t *hr = (bcf_hdr_t*)_hr;

    // header lines
    int i, nw_ori = hw->nhrec;  // only compare against records present before the merge
    for (i=0; i<hr->nhrec; i++)
    {
        bcf_hrec_t *src = hr->hrec[i];

        if ( src->type==BCF_HL_GEN && src->value )
        {
            int j;
            for (j=0; j<nw_ori; j++)
            {
                if ( hw->hrec[j]->type!=BCF_HL_GEN ) continue;
                if ( !hw->hrec[j]->value ) continue;    // a value-less line cannot equal one with a value
                if ( !strcmp(src->key,hw->hrec[j]->key) && !strcmp(src->value,hw->hrec[j]->value) ) break;
            }
            if ( j>=nw_ori ) bcf_hdr_add_hrec(hw, bcf_hrec_dup(src));
        }
        else
        {
            bcf_hrec_t *rec;

            if ( src->nkeys<1 ) continue;   // no vals[0] to look the record up by

            rec = bcf_hdr_get_hrec(hw, src->type, src->vals[0]);
            if ( !rec ) bcf_hdr_add_hrec(hw, bcf_hrec_dup(src));
        }
    }

    // samples
    for (i=0; i<bcf_hdr_nsamples(hr); i++)
    {
        char *name = hr->samples[i];
        if ( bcf_hdr_id2int(hw, BCF_DT_SAMPLE, name)!=-1 )
        {
            // there is a sample with the same name
            size_t len = strlen(clash_prefix) + 1 + strlen(name) + 1;   // prefix ':' name '\0'
            char *renamed = (char*) malloc(len);
            if ( !renamed )
            {
                fprintf(stderr, "[E::%s] out of memory while renaming sample %s\n", __func__, name);
                return;
            }
            snprintf(renamed, len, "%s:%s", clash_prefix, name);
            bcf_hdr_add_sample(hw, renamed);
            free(renamed);
        }
        else
            bcf_hdr_add_sample(hw, name);
    }
}
/**
 * Populate `hdr` from the header text stored in the file `fname`.
 *
 * All lines but the last are parsed as header records; lines that
 * bcf_hdr_parse_line() rejects (NULL result) are skipped. The last line is
 * passed to bcf_hdr_parse_sample_line(). The header is then synced and its
 * text regenerated with bcf_hdr_fmt_text().
 *
 * @param hdr    header to fill
 * @param fname  file to read the header lines from
 * @return       0 on success, 1 if the file could not be read or is empty
 */
int bcf_hdr_set(bcf_hdr_t *hdr, const char *fname)
{
    int i, n;
    char **lines = hts_readlines(fname, &n);
    if ( !lines ) return 1;
    if ( n<1 )
    {
        // no lines at all: lines[n-1] below would be out of bounds
        free(lines);
        return 1;
    }

    for (i=0; i<n-1; i++)
    {
        int k;
        bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,lines[i],&k);
        if ( hrec ) bcf_hdr_add_hrec(hdr, hrec);    // NULL means the line did not parse
        free(lines[i]);
    }

    bcf_hdr_parse_sample_line(hdr,lines[n-1]);
    free(lines[n-1]);
    free(lines);

    bcf_hdr_sync(hdr);
    bcf_hdr_fmt_text(hdr);
    return 0;
}
/**
 * Read a header from `fp`.
 *
 * Binary files (fp->is_bin set) are delegated to bcf_hdr_read(). For text
 * files:
 *   - Lines are read until the first line starting with a single '#'. Empty
 *     lines are skipped. A line not starting with '#' before that point is
 *     an error.
 *   - If fp->fn_aux is set, one "##contig=<ID=...,length=...>" line per
 *     record of that file is inserted just before the single-'#' line. If
 *     the auxiliary file cannot be opened, no contig lines are inserted.
 *   - The collected text is parsed with bcf_hdr_parse().
 *   - If a tabix index can be loaded for fp->fn, every sequence name in the
 *     index that has no contig record in the header gets one with length
 *     "-1"; the header is then synced and its text regenerated.
 *
 * @param fp  open file handle
 * @return    the header, or 0 (NULL) if a non-header line precedes the
 *            sample line or no header text could be read
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
    if (fp->is_bin) return bcf_hdr_read((BGZF*)fp->fp);

    kstring_t txt, *s = &fp->line;
    bcf_hdr_t *h;
    h = bcf_hdr_init();
    txt.l = txt.m = 0; txt.s = 0;

    while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
        if (s->l == 0) continue;
        if (s->s[0] != '#') {
            if (hts_verbose >= 2)
                fprintf(stderr, "[E::%s] no sample line\n", __func__);
            free(txt.s);
            bcf_hdr_destroy(h);
            return 0;
        }
        if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
            gzFile f = gzopen(fp->fn_aux, "r");
            if (f) {
                int dret;
                kstream_t *ks;
                kstring_t tmp;
                tmp.l = tmp.m = 0; tmp.s = 0;
                ks = ks_init(f);
                // first field of each line is used as the contig ID, the second as its length
                while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
                    int c;
                    kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
                    // NOTE(review): the result of this second read is not checked; on failure
                    // tmp still holds the name and atol() is applied to it - confirm intended
                    ks_getuntil(ks, 0, &tmp, &dret);
                    kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
                    kputsn(">\n", 2, &txt);
                    if (dret != '\n')
                        while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
                }
                free(tmp.s);
                ks_destroy(ks);
                gzclose(f);
            } else if (hts_verbose >= 2) {
                fprintf(stderr, "[W::%s] could not open %s, no contig lines inserted\n", __func__, fp->fn_aux);
            }
        }
        kputsn(s->s, s->l, &txt);
        if (s->s[1] != '#') break;  // the single-'#' line ends the header
        kputc('\n', &txt);
    }

    if (!txt.s) {
        // nothing was read: there is no text to hand to the parser
        if (hts_verbose >= 2)
            fprintf(stderr, "[E::%s] could not read the header\n", __func__);
        bcf_hdr_destroy(h);
        return 0;
    }

    h->l_text = txt.l + 1; // including NULL
    h->text = txt.s;
    bcf_hdr_parse(h);

    // check tabix index, are all contigs listed in the header? add the missing ones
    tbx_t *idx = tbx_index_load(fp->fn);
    if ( idx )
    {
        int i, n, need_sync = 0;
        const char **names = tbx_seqnames(idx, &n);
        for (i=0; i<n; i++)
        {
            // Look up by record type (BCF_HL_*), as the other callers of
            // bcf_hdr_get_hrec do; BCF_DT_CTG is a dictionary index.
            bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_HL_CTG, (char*) names[i]);
            if ( hrec ) continue;
            hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
            hrec->key = strdup("contig");
            bcf_hrec_add_key(hrec, "ID", strlen("ID"));
            bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
            bcf_hrec_add_key(hrec, "length", strlen("length"));
            bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
            bcf_hdr_add_hrec(h, hrec);
            need_sync = 1;
        }
        free(names);
        tbx_destroy(idx);
        if ( need_sync )
        {
            bcf_hdr_sync(h);
            bcf_hdr_fmt_text(h);
        }
    }
    return h;
}