/**
 * Write a VCFv4.1 text header to `fout`.
 *
 * The header contains, in order: file format and date, one `##mccortex_<key>`
 * line for the current command followed by one line per entry of the JSON
 * "commands" array, INFO/FORMAT/FILTER definitions, the reference paths,
 * one `##contig` line per loaded chromosome and finally the column header.
 *
 * Uses the file-scope variables input_path, kmer_size, ref_paths,
 * num_ref_paths and chroms.
 *
 * @param json          parsed JSON header of the input file (must not be NULL)
 * @param is_breakpoint true  => BRKPNT INFO tag plus one column per non-ref
 *                               colour; false => BUBBLE INFO tag, no sample
 *                               columns
 * @param fout          stream that receives every byte of the header
 */
static void print_vcf_header(cJSON *json, bool is_breakpoint, FILE *fout)
{
  ctx_assert(json != NULL);

  char datestr[9];
  time_t date = time(NULL);
  strftime(datestr, 9, "%Y%m%d", localtime(&date));

  fprintf(fout, "##fileformat=VCFv4.1\n##fileDate=%s\n", datestr);

  // Print commands used to generate header
  cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, input_path);
  cJSON *command = commands->child;

  // Print this command
  char keystr[8];
  char *prevstr = NULL;
  size_t i;

  // The current command's "prev" is the key of the first listed command
  if(command) {
    cJSON *key = json_hdr_get(command, "key", cJSON_String, input_path);
    prevstr = key->valuestring;
  }

  // Print command entry for this command
  fprintf(fout, "##mccortex_%s=<prev=\"%s\",cmd=\"%s\",cwd=\"%s\",version="CTX_VERSION">\n",
          hex_rand_str(keystr, sizeof(keystr)),
          prevstr ? prevstr : "NULL",
          cmd_get_cmdline(), cmd_get_cwd());

  // Print previous commands
  for(; command != NULL; command = command->next)
  {
    cJSON *key  = json_hdr_get(command, "key",  cJSON_String, input_path);
    cJSON *cmd  = json_hdr_get(command, "cmd",  cJSON_Array,  input_path);
    cJSON *cwd  = json_hdr_get(command, "cwd",  cJSON_String, input_path);
    cJSON *prev = json_hdr_get(command, "prev", cJSON_Array,  input_path);
    cJSON *ver  = json_hdr_try(command, "mccortex",cJSON_String, input_path);

    prev = prev->child; // result could be NULL

    // Validate every element of 'prev' before printing any of them: each one
    // is written with %s below, so all of them must be strings.
    cJSON *chk;
    for(chk = prev; chk != NULL; chk = chk->next) {
      if(chk->type != cJSON_String) die("Invalid 'prev' field");
    }

    fprintf(fout, "##mccortex_%s=<prev=\"%s", key->valuestring,
                  prev ? prev->valuestring : "NULL");
    if(prev) {
      while((prev = prev->next) != NULL)
        fprintf(fout, ";%s", prev->valuestring);
    }

    // Command line arguments are joined with single spaces
    fprintf(fout, "\",cmd=\"");
    for(i = 0, cmd = cmd->child; cmd; cmd = cmd->next, i++) {
      if(i > 0) fputc(' ', fout);
      fputs(cmd->valuestring, fout);
    }
    fprintf(fout, "\",cwd=\"%s\"", cwd->valuestring);

    // "mccortex" version field is optional
    if(ver) {
      fprintf(fout, ",version=\"%s\"", ver->valuestring);
    }
    fprintf(fout, ">\n");
  }

  // Print field definitions
  if(is_breakpoint)
    fprintf(fout, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n");
  else
    fprintf(fout, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n");

  fprintf(fout, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size);

  fprintf(fout, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
  fprintf(fout, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");

  // Print reference paths (comma separated, all written to fout)
  fprintf(fout, "##reference=%s", ref_paths[0]);
  for(i = 1; i < num_ref_paths; i++) fprintf(fout, ",%s", ref_paths[i]);
  fprintf(fout, "\n");

  // Print contigs lengths
  for(i = 0; i < chroms.len; i++) {
    fprintf(fout, "##contig=<ID=%s,length=%zu>\n",
            chroms.b[i].name.b, chroms.b[i].seq.end);
  }

  // Print VCF column header
  fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", fout);

  if(is_breakpoint)
  {
    // Print a column for each sample
    cJSON *graph_json   = json_hdr_get(json,       "graph",   cJSON_Object, input_path);
    cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array,  input_path);
    cJSON *colour_json  = colours_json->child;
    if(colour_json == NULL) die("Missing colours");
    for(; colour_json; colour_json = colour_json->next)
    {
      // Reference colours do not get a sample column
      if(!json_hdr_colour_is_ref(colour_json)) {
        cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, input_path);
        fputc('\t', fout);
        fputs(sample_json->valuestring, fout);
      }
    }
  }

  fputc('\n', fout);
}
/**
 * Build an htslib VCF header from a McCortex JSON header.
 *
 * The header text (VCFv4.2) is assembled in a temporary string buffer and
 * then handed to bcf_hdr_parse(). It contains: file format and date, a
 * `##mccortex_<key>` line for the current command, the previous commands
 * (via vcf_hdrtxt_append_commands()), INFO/FORMAT/FILTER definitions, the
 * reference paths, one `##contig` line per chromosome and the column header.
 *
 * @param json          parsed JSON header (must not be NULL)
 * @param in_path       path of the input file, passed to the json_hdr_* helpers
 * @param is_breakpoint true  => BRKPNT INFO tag plus one column per non-ref
 *                               colour; false => BUBBLE INFO tag, no sample
 *                               columns
 * @param kmer_size     kmer size used for the `K<k>` INFO flag
 * @param ref_paths     reference paths; needs at least one entry because
 *                      ref_paths[0] is always written
 * @param nref_paths    number of entries in ref_paths (must be > 0)
 * @param chroms        chromosomes to list as `##contig` lines
 * @param nchroms       number of entries in chroms
 * @return newly created header; calls die() if it cannot be allocated or the
 *         generated text cannot be parsed. The header is not released here,
 *         so the caller presumably frees it with bcf_hdr_destroy().
 */
static bcf_hdr_t* make_vcf_hdr(cJSON *json, const char *in_path,
                               bool is_breakpoint, size_t kmer_size,
                               char const*const* ref_paths, size_t nref_paths,
                               read_t *chroms, size_t nchroms)
{
  ctx_assert(json != NULL);
  // ref_paths[0] is read unconditionally below
  ctx_assert(nref_paths > 0);

  StrBuf hdrbuf;
  strbuf_alloc(&hdrbuf, 1024);

  char datestr[9];
  time_t date = time(NULL);
  strftime(datestr, 9, "%Y%m%d", localtime(&date));

  strbuf_append_str(&hdrbuf, "##fileformat=VCFv4.2\n##fileDate=");
  strbuf_append_str(&hdrbuf, datestr);
  strbuf_append_str(&hdrbuf, "\n");

  // Print commands used to generate header
  cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, in_path);
  cJSON *command = commands->child;

  // Print this command
  char keystr[8];
  char *prevstr = NULL;
  size_t i;

  // The current command's "prev" is the key of the first listed command
  if(command) {
    cJSON *key = json_hdr_get(command, "key", cJSON_String, in_path);
    prevstr = key->valuestring;
  }

  // Print command entry for this command
  strbuf_append_str(&hdrbuf, "##mccortex_");
  strbuf_append_str(&hdrbuf, hex_rand_str(keystr, sizeof(keystr)));
  strbuf_append_str(&hdrbuf, "=<prev=\"");
  strbuf_append_str(&hdrbuf, prevstr ? prevstr : "NULL");
  strbuf_append_str(&hdrbuf, "\",cmd=\"");
  strbuf_append_str(&hdrbuf, cmd_get_cmdline());
  strbuf_append_str(&hdrbuf, "\",cwd=\"");
  strbuf_append_str(&hdrbuf, cmd_get_cwd());
  strbuf_append_str(&hdrbuf, "\",version="CTX_VERSION">\n");

  // Print previous commands
  vcf_hdrtxt_append_commands(command, &hdrbuf, in_path);

  // Print field definitions
  if(is_breakpoint)
    strbuf_append_str(&hdrbuf, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n");
  else
    strbuf_append_str(&hdrbuf, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n");

  strbuf_sprintf(&hdrbuf, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size);

  strbuf_append_str(&hdrbuf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
  strbuf_append_str(&hdrbuf, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");

  // Print reference paths (comma separated)
  strbuf_append_str(&hdrbuf, "##reference=");
  strbuf_append_str(&hdrbuf, ref_paths[0]);
  for(i = 1; i < nref_paths; i++) {
    strbuf_append_char(&hdrbuf, ',');
    strbuf_append_str(&hdrbuf, ref_paths[i]);
  }
  strbuf_append_str(&hdrbuf, "\n");

  // Print contigs lengths
  for(i = 0; i < nchroms; i++) {
    strbuf_sprintf(&hdrbuf, "##contig=<ID=%s,length=%zu>\n",
                   chroms[i].name.b, chroms[i].seq.end);
  }

  // Print VCF column header
  strbuf_append_str(&hdrbuf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");

  if(is_breakpoint)
  {
    // Print a column for each sample
    cJSON *graph_json   = json_hdr_get(json,       "graph",   cJSON_Object, in_path);
    cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array,  in_path);
    cJSON *colour_json  = colours_json->child;
    if(colour_json == NULL) die("Missing colours");
    for(; colour_json; colour_json = colour_json->next)
    {
      // Reference colours do not get a sample column
      if(!json_hdr_colour_is_ref(colour_json)) {
        cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, in_path);
        strbuf_append_str(&hdrbuf, "\t");
        strbuf_append_str(&hdrbuf, sample_json->valuestring);
      }
    }
  }

  strbuf_append_char(&hdrbuf, '\n');

  // bcf_hdr_init() can fail to allocate; do not hand NULL to bcf_hdr_parse()
  bcf_hdr_t *hdr = bcf_hdr_init("w");
  if(hdr == NULL || bcf_hdr_parse(hdr, hdrbuf.b) != 0)
    die("Cannot construct VCF header");

  strbuf_dealloc(&hdrbuf);

  return hdr;
}
int ctx_vcfcov(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL, *out_type = NULL; uint32_t max_allele_len = 0, max_gt_vars = 0; char *ref_path = NULL; bool low_mem = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; size_t i; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'r': cmd_check(!ref_path, cmd); ref_path = optarg; break; case 'L': cmd_check(!max_allele_len,cmd); max_allele_len = cmd_uint32(cmd,optarg); break; case 'N': cmd_check(!max_gt_vars,cmd); max_gt_vars = cmd_uint32(cmd,optarg); break; case 'M': cmd_check(!low_mem, cmd); low_mem = true; break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" "SUBCMD" -h` for help. 
Bad option: %s", argv[optind-1]); default: abort(); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(ref_path == NULL) cmd_print_usage("Require a reference (-r,--ref <ref.fa>)"); if(optind+2 > argc) cmd_print_usage("Require VCF and graph files"); if(!max_allele_len) max_allele_len = DEFAULT_MAX_ALLELE_LEN; if(!max_gt_vars) max_gt_vars = DEFAULT_MAX_GT_VARS; status("[vcfcov] max allele length: %u; max number of variants: %u", max_allele_len, max_gt_vars); // open ref // index fasta with: samtools faidx ref.fa faidx_t *fai = fai_load(ref_path); if(fai == NULL) die("Cannot load ref index: %s / %s.fai", ref_path, ref_path); // Open input VCF file const char *vcf_path = argv[optind++]; htsFile *vcffh = hts_open(vcf_path, "r"); if(vcffh == NULL) die("Cannot open VCF file: %s", vcf_path); bcf_hdr_t *vcfhdr = bcf_hdr_read(vcffh); if(vcfhdr == NULL) die("Cannot read VCF header: %s", vcf_path); // Test we can close and reopen files if(low_mem) { if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); } // // Open graph files // const size_t num_gfiles = argc - optind; char **graph_paths = argv + optind; ctx_assert(num_gfiles > 0); GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); size_t ncols, ctx_max_kmers = 0, ctx_sum_kmers = 0; ncols = graph_files_open(graph_paths, gfiles, num_gfiles, &ctx_max_kmers, &ctx_sum_kmers); // Check graph + paths are compatible graphs_gpaths_compatible(gfiles, num_gfiles, NULL, 0, -1); // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + sizeof(Covg)*8 * ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, low_mem ? 
-1 : (int64_t)ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // // Open output file // // v=>vcf, z=>compressed vcf, b=>bcf, bu=>uncompressed bcf int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *outfh = hts_open(out_path, modes_htslib[mode]); status("[vcfcov] Output format: %s", hsmodes_htslib[mode]); // Allocate memory dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, ncols, 1, kmers_in_hash, DBG_ALLOC_COVGS); // // Set up tag names // // *R => ref, *A => alt sprintf(kcov_ref_tag, "K%zuR", db_graph.kmer_size); // mean coverage sprintf(kcov_alt_tag, "K%zuA", db_graph.kmer_size); // #SAMPLE=<ID=...,K29KCOV=...,K29NK=...,K29RLK> // - K29_kcov is empirical kmer coverage // - K29_nkmers is the number of kmers in the sample // - mean_read_length is the mean read length in bases char sample_kcov_tag[20], sample_nk_tag[20], sample_rlk_tag[20]; sprintf(sample_kcov_tag, "K%zu_kcov", db_graph.kmer_size); // mean coverage sprintf(sample_nk_tag, "K%zu_nkmers", db_graph.kmer_size); sprintf(sample_rlk_tag, "mean_read_length"); // // Load kmers if we are using --low-mem // VcfCovStats st; memset(&st, 0, sizeof(st)); VcfCovPrefs prefs = {.kcov_ref_tag = kcov_ref_tag, .kcov_alt_tag = kcov_alt_tag, .max_allele_len = max_allele_len, .max_gt_vars = max_gt_vars, .load_kmers_only = false}; if(low_mem) { status("[vcfcov] Loading kmers from VCF+ref"); prefs.load_kmers_only = true; vcfcov_file(vcffh, vcfhdr, NULL, NULL, vcf_path, fai, NULL, &prefs, &st, &db_graph); // Close files hts_close(vcffh); bcf_hdr_destroy(vcfhdr); // Re-open files if((vcffh = hts_open(vcf_path, "r")) == NULL) die("Cannot re-open VCF file: %s", vcf_path); if((vcfhdr = bcf_hdr_read(vcffh)) == NULL) die("Cannot re-read VCF header: %s", vcf_path); prefs.load_kmers_only = false; } // // Load graphs // GraphLoadingStats gstats; memset(&gstats, 0, sizeof(gstats)); GraphLoadingPrefs gprefs = 
graph_loading_prefs(&db_graph); gprefs.must_exist_in_graph = low_mem; for(i = 0; i < num_gfiles; i++) { graph_load(&gfiles[i], gprefs, &gstats); graph_file_close(&gfiles[i]); } ctx_free(gfiles); hash_table_print_stats(&db_graph.ht); // // Set up VCF header / graph matchup // size_t *samplehdrids = ctx_malloc(db_graph.num_of_cols * sizeof(size_t)); // Add samples to vcf header bcf_hdr_t *outhdr = bcf_hdr_dup(vcfhdr); bcf_hrec_t *hrec; int sid; char hdrstr[200]; for(i = 0; i < db_graph.num_of_cols; i++) { char *sname = db_graph.ginfo[i].sample_name.b; if((sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname)) < 0) { bcf_hdr_add_sample(outhdr, sname); sid = bcf_hdr_id2int(outhdr, BCF_DT_SAMPLE, sname); } samplehdrids[i] = sid; // Add SAMPLE field hrec = bcf_hdr_get_hrec(outhdr, BCF_HL_STR, "ID", sname, "SAMPLE"); if(hrec == NULL) { sprintf(hdrstr, "##SAMPLE=<ID=%s,%s=%"PRIu64",%s=%"PRIu64",%s=%zu>", sname, sample_kcov_tag, gstats.nkmers[i] ? gstats.sumcov[i] / gstats.nkmers[i] : 0, sample_nk_tag, gstats.nkmers[i], sample_rlk_tag, (size_t)db_graph.ginfo[i].mean_read_length); bcf_hdr_append(outhdr, hdrstr); } else { // mean kcovg sprintf(hdrstr, "%"PRIu64, gstats.sumcov[i] / gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_kcov_tag, hdrstr); // num kmers sprintf(hdrstr, "%"PRIu64, gstats.nkmers[i]); vcf_misc_add_update_hrec(hrec, sample_nk_tag, hdrstr); // mean read length in kmers sprintf(hdrstr, "%zu", (size_t)db_graph.ginfo[i].mean_read_length); vcf_misc_add_update_hrec(hrec, sample_rlk_tag, hdrstr); } status("[vcfcov] Colour %zu: %s [VCF column %zu]", i, sname, samplehdrids[i]); } // Add genotype format fields // One field per alternative allele sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on ref (k=%zu): sum(kmer_covs) / exp_num_kmers\">\n", kcov_ref_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); sprintf(hdrstr, "##FORMAT=<ID=%s,Number=A,Type=Integer," "Description=\"Coverage on alt (k=%zu): sum(kmer_covs) / 
exp_num_kmers\">\n", kcov_alt_tag, db_graph.kmer_size); bcf_hdr_append(outhdr, hdrstr); bcf_hdr_set_version(outhdr, "VCFv4.2"); // Add command string to header vcf_misc_hdr_add_cmd(outhdr, cmd_get_cmdline(), cmd_get_cwd()); if(bcf_hdr_write(outfh, outhdr) != 0) die("Cannot write header to: %s", futil_outpath_str(out_path)); status("[vcfcov] Reading %s and adding coverage", vcf_path); // Reset stats and get coverage memset(&st, 0, sizeof(st)); vcfcov_file(vcffh, vcfhdr, outfh, outhdr, vcf_path, fai, samplehdrids, &prefs, &st, &db_graph); // Print statistics char ns0[50], ns1[50]; status("[vcfcov] Read %s VCF lines", ulong_to_str(st.nvcf_lines, ns0)); status("[vcfcov] Read %s ALTs", ulong_to_str(st.nalts_read, ns0)); status("[vcfcov] Used %s kmers", ulong_to_str(st.ngt_kmers, ns0)); status("[vcfcov] ALTs used: %s / %s (%.2f%%)", ulong_to_str(st.nalts_loaded, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_loaded) / st.nalts_read : 0.0); status("[vcfcov] ALTs too long (>%ubp): %s / %s (%.2f%%)", max_allele_len, ulong_to_str(st.nalts_too_long, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_too_long) / st.nalts_read : 0.0); status("[vcfcov] ALTs too dense (>%u within %zubp): %s / %s (%.2f%%)", max_gt_vars, db_graph.kmer_size, ulong_to_str(st.nalts_no_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_no_covg) / st.nalts_read : 0.0); status("[vcfcov] ALTs printed with coverage: %s / %s (%.2f%%)", ulong_to_str(st.nalts_with_covg, ns0), ulong_to_str(st.nalts_read, ns1), st.nalts_read ? (100.0*st.nalts_with_covg) / st.nalts_read : 0.0); status("[vcfcov] Saved to: %s\n", out_path); ctx_free(samplehdrids); graph_loading_stats_destroy(&gstats); bcf_hdr_destroy(vcfhdr); bcf_hdr_destroy(outhdr); hts_close(vcffh); hts_close(outfh); fai_destroy(fai); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }