bcf_hdr_t *bcf_hdr_read(BGZF *fp) { uint8_t magic[5]; bcf_hdr_t *h; h = bcf_hdr_init(); bgzf_read(fp, magic, 5); if (strncmp((char*)magic, "BCF\2\1", 5) != 0) { if (hts_verbose >= 2) fprintf(stderr, "[E::%s] invalid BCF2 magic string\n", __func__); bcf_hdr_destroy(h); return 0; } bgzf_read(fp, &h->l_text, 4); h->text = (char*)malloc(h->l_text); bgzf_read(fp, h->text, h->l_text); bcf_hdr_parse(h); return h; }
static bcf_hdr_t* make_vcf_hdr(cJSON *json, const char *in_path, bool is_breakpoint, size_t kmer_size, char const*const* ref_paths, size_t nref_paths, read_t *chroms, size_t nchroms) { ctx_assert(json != NULL); StrBuf hdrbuf; strbuf_alloc(&hdrbuf, 1024); char datestr[9]; time_t date = time(NULL); strftime(datestr, 9, "%Y%m%d", localtime(&date)); strbuf_append_str(&hdrbuf, "##fileformat=VCFv4.2\n##fileDate="); strbuf_append_str(&hdrbuf, datestr); strbuf_append_str(&hdrbuf, "\n"); // Print commands used to generate header cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, in_path); cJSON *command = commands->child; // Print this command char keystr[8]; char *prevstr = NULL; size_t i; if(command) { cJSON *key = json_hdr_get(command, "key", cJSON_String, in_path); prevstr = key->valuestring; } // Print command entry for this command strbuf_append_str(&hdrbuf, "##mccortex_"); strbuf_append_str(&hdrbuf, hex_rand_str(keystr, sizeof(keystr))); strbuf_append_str(&hdrbuf, "=<prev=\""); strbuf_append_str(&hdrbuf, prevstr ? prevstr : "NULL"); strbuf_append_str(&hdrbuf, "\",cmd=\""); strbuf_append_str(&hdrbuf, cmd_get_cmdline()); strbuf_append_str(&hdrbuf, "\",cwd=\""); strbuf_append_str(&hdrbuf, cmd_get_cwd()); strbuf_append_str(&hdrbuf, "\",version="CTX_VERSION">\n"); // Print previous commands vcf_hdrtxt_append_commands(command, &hdrbuf, in_path); // Print field definitions if(is_breakpoint) strbuf_append_str(&hdrbuf, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n"); else strbuf_append_str(&hdrbuf, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n"); strbuf_sprintf(&hdrbuf, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size); strbuf_append_str(&hdrbuf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"); strbuf_append_str(&hdrbuf, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n"); // Print reference paths strbuf_append_str(&hdrbuf, "##reference="); strbuf_append_str(&hdrbuf, ref_paths[0]); for(i = 1; i < nref_paths; i++) { strbuf_append_char(&hdrbuf, ','); strbuf_append_str(&hdrbuf, ref_paths[i]); } strbuf_append_str(&hdrbuf, "\n"); // Print contigs lengths for(i = 0; i < nchroms; i++) { strbuf_sprintf(&hdrbuf, "##contig=<ID=%s,length=%zu>\n", chroms[i].name.b, chroms[i].seq.end); } // Print VCF column header strbuf_append_str(&hdrbuf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); if(is_breakpoint) { // Print a column for each sample cJSON *graph_json = json_hdr_get(json, "graph", cJSON_Object, in_path); cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array, in_path); cJSON *colour_json = colours_json->child; if(colour_json == NULL) die("Missing colours"); for(; colour_json; colour_json = colour_json->next) { if(!json_hdr_colour_is_ref(colour_json)) { cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, in_path); strbuf_append_str(&hdrbuf, "\t"); strbuf_append_str(&hdrbuf, sample_json->valuestring); } } } strbuf_append_char(&hdrbuf, '\n'); bcf_hdr_t *hdr = bcf_hdr_init("w"); if(bcf_hdr_parse(hdr, hdrbuf.b) != 0) die("Cannot construct VCF header"); strbuf_dealloc(&hdrbuf); return hdr; }
bcf_hdr_t *vcf_hdr_read(htsFile *fp) { if (!fp->is_bin) { kstring_t txt, *s = &fp->line; bcf_hdr_t *h; h = bcf_hdr_init(); txt.l = txt.m = 0; txt.s = 0; while (hts_getline(fp, KS_SEP_LINE, s) >= 0) { if (s->l == 0) continue; if (s->s[0] != '#') { if (hts_verbose >= 2) fprintf(stderr, "[E::%s] no sample line\n", __func__); free(txt.s); bcf_hdr_destroy(h); return 0; } if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here int dret; gzFile f; kstream_t *ks; kstring_t tmp; tmp.l = tmp.m = 0; tmp.s = 0; f = gzopen(fp->fn_aux, "r"); ks = ks_init(f); while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) { int c; kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt); ks_getuntil(ks, 0, &tmp, &dret); kputs(",length=", &txt); kputw(atol(tmp.s), &txt); kputsn(">\n", 2, &txt); if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line } free(tmp.s); ks_destroy(ks); gzclose(f); } kputsn(s->s, s->l, &txt); if (s->s[1] != '#') break; kputc('\n', &txt); } h->l_text = txt.l + 1; // including NULL h->text = txt.s; bcf_hdr_parse(h); // check tabix index, are all contigs listed in the header? add the missing ones tbx_t *idx = tbx_index_load(fp->fn); if ( idx ) { int i, n, need_sync = 0; const char **names = tbx_seqnames(idx, &n); for (i=0; i<n; i++) { bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]); if ( hrec ) continue; hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); hrec->key = strdup("contig"); bcf_hrec_add_key(hrec, "ID", strlen("ID")); bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); bcf_hrec_add_key(hrec, "length", strlen("length")); bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0); // what is a good default value? bcf_hdr_add_hrec(h, hrec); need_sync = 1; } free(names); tbx_destroy(idx); if ( need_sync ) { bcf_hdr_sync(h); bcf_hdr_fmt_text(h); } } return h; } else return bcf_hdr_read((BGZF*)fp->fp); }
static void reheader_bcf(args_t *args, int is_compressed) { htsFile *fp = hts_open(args->fname, "r"); if ( !fp ) error("Failed to open: %s\n", args->fname); bcf_hdr_t *hdr = bcf_hdr_read(fp); if ( !hdr ) error("Failed to read the header: %s\n", args->fname); kstring_t htxt = {0,0,0}; int hlen; htxt.s = bcf_hdr_fmt_text(hdr, 1, &hlen); htxt.l = hlen; int i, nsamples = 0; char **samples = NULL; if ( args->samples_fname ) samples = hts_readlines(args->samples_fname, &nsamples); if ( args->header_fname ) { free(htxt.s); htxt.s = NULL; htxt.l = htxt.m = 0; read_header_file(args->header_fname, &htxt); } if ( samples ) { set_samples(samples, nsamples, &htxt); for (i=0; i<nsamples; i++) free(samples[i]); free(samples); } bcf_hdr_t *hdr_out = bcf_hdr_init("r"); bcf_hdr_parse(hdr_out, htxt.s); if ( args->header_fname ) hdr_out = strip_header(hdr, hdr_out); // write the header and the body htsFile *fp_out = hts_open("-",is_compressed ? "wb" : "wbu"); bcf_hdr_write(fp_out, hdr_out); bcf1_t *rec = bcf_init(); while ( bcf_read(fp, hdr, rec)==0 ) { // sanity checking, this slows things down. Make it optional? bcf_unpack(rec, BCF_UN_ALL); if ( rec->rid >= hdr_out->n[BCF_DT_CTG] || strcmp(bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid),bcf_hdr_int2id(hdr_out,BCF_DT_CTG,rec->rid)) ) error("The CHROM is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_CTG,rec->rid)); for (i=0; i<rec->d.n_flt; i++) { int id = rec->d.flt[i]; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FLT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FILTER ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->d.n_flt ) error("The FILTER is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.flt[i])); for (i=0; i<rec->n_info; i++) { int id = rec->d.info[i].key; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_INFO,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken INFO ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_info ) error("The INFO tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.info[i].key)); for (i=0; i<rec->n_fmt; i++) { int id = rec->d.fmt[i].id; if ( id >= hdr_out->n[BCF_DT_ID] ) break; if ( !hdr_out->id[BCF_DT_ID][id].key ) break; if ( !bcf_hdr_idinfo_exists(hdr_out,BCF_HL_FMT,id) ) break; if ( strcmp(hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key) ) error("FIXME: Broken FORMAT ids: %s vs %s\n", hdr->id[BCF_DT_ID][id].key,hdr_out->id[BCF_DT_ID][id].key); } if ( i!=rec->n_fmt ) error("The FORMAT tag is not defined: \"%s\"\n", bcf_hdr_int2id(hdr,BCF_DT_ID,rec->d.fmt[i].id)); bcf_write(fp_out,hdr_out,rec); } bcf_destroy(rec); free(htxt.s); hts_close(fp_out); hts_close(fp); bcf_hdr_destroy(hdr_out); bcf_hdr_destroy(hdr); }