// Find/add and then update a header record void vcf_misc_add_update_hrec(bcf_hrec_t *hrec, char *key, char *val) { int keyidx = bcf_hrec_find_key(hrec, key); if(keyidx < 0) { status("Adding sample key [%s] => [%s]", key, val); bcf_hrec_add_key(hrec, key, strlen(key)); keyidx = hrec->nkeys-1; } bcf_hrec_set_val(hrec, keyidx, val, strlen(val), 0); // 0 => not quoted }
bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, char *line, int *len) { char *p = line; if (p[0] != '#' || p[1] != '#') { *len = 0; return NULL; } p += 2; char *q = p; while ( *q && *q!='=' ) q++; int n = q-p; if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format bcf_hrec_t *hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); hrec->key = (char*) malloc(sizeof(char)*(n+1)); memcpy(hrec->key,p,n); hrec->key[n] = 0; p = ++q; if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579 { while ( *q && *q!='\n' ) q++; hrec->value = (char*) malloc((q-p+1)*sizeof(char)); memcpy(hrec->value, p, q-p); hrec->value[q-p] = 0; *len = q-line+1; return hrec; } // structured line, e.g. ##INFO=<ID=PV1,Number=1,Type=Float,Description="P-value for baseQ bias"> while ( *q && *q!='\n' ) { p = ++q; while ( *q && *q!='=' ) q++; n = q-p; if ( *q!='=' || !n ) { *len = q-line+1; return NULL; } // wrong format bcf_hrec_add_key(hrec, p, q-p); p = ++q; int quoted = *p=='"' ? 1 : 0; if ( quoted ) p++, q++; while (1) { if ( !*q ) break; if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } else { if ( *q==',' || *q=='>' ) break; } q++; } bcf_hrec_set_val(hrec, hrec->nkeys-1, p, q-p, quoted); if ( quoted ) q++; if ( *q=='>' ) { q++; break; } } *len = q-line+1; return hrec; }
bcf_hdr_t *vcf_hdr_read(htsFile *fp) { if (!fp->is_bin) { kstring_t txt, *s = &fp->line; bcf_hdr_t *h; h = bcf_hdr_init(); txt.l = txt.m = 0; txt.s = 0; while (hts_getline(fp, KS_SEP_LINE, s) >= 0) { if (s->l == 0) continue; if (s->s[0] != '#') { if (hts_verbose >= 2) fprintf(stderr, "[E::%s] no sample line\n", __func__); free(txt.s); bcf_hdr_destroy(h); return 0; } if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here int dret; gzFile f; kstream_t *ks; kstring_t tmp; tmp.l = tmp.m = 0; tmp.s = 0; f = gzopen(fp->fn_aux, "r"); ks = ks_init(f); while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) { int c; kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt); ks_getuntil(ks, 0, &tmp, &dret); kputs(",length=", &txt); kputw(atol(tmp.s), &txt); kputsn(">\n", 2, &txt); if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line } free(tmp.s); ks_destroy(ks); gzclose(f); } kputsn(s->s, s->l, &txt); if (s->s[1] != '#') break; kputc('\n', &txt); } h->l_text = txt.l + 1; // including NULL h->text = txt.s; bcf_hdr_parse(h); // check tabix index, are all contigs listed in the header? add the missing ones tbx_t *idx = tbx_index_load(fp->fn); if ( idx ) { int i, n, need_sync = 0; const char **names = tbx_seqnames(idx, &n); for (i=0; i<n; i++) { bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]); if ( hrec ) continue; hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); hrec->key = strdup("contig"); bcf_hrec_add_key(hrec, "ID", strlen("ID")); bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); bcf_hrec_add_key(hrec, "length", strlen("length")); bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0); // what is a good default value? bcf_hdr_add_hrec(h, hrec); need_sync = 1; } free(names); tbx_destroy(idx); if ( need_sync ) { bcf_hdr_sync(h); bcf_hdr_fmt_text(h); } } return h; } else return bcf_hdr_read((BGZF*)fp->fp); }