int ctx_calls2vcf(int argc, char **argv) { const char *in_path = NULL, *out_path = NULL, *out_type = NULL; // Filtering parameters int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1; // Alignment parameters int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1; // ref paths char const*const* ref_paths = NULL; size_t nref_paths = 0; // flank file const char *sam_path = NULL; // // Things we figure out by looking at the input // bool isbubble = false; // samples in VCF, (0 for bubble, does not include ref in breakpoint calls) size_t i, kmer_size, num_samples; // // Reference genome // // Hash map of chromosome name -> sequence ChromHash *genome; ReadBuffer chroms; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len < 0,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(max_align_len < 0) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); in_path = argv[optind++]; ref_paths = (char const*const*)argv + optind; nref_paths = argc - optind; // These functions call die() on error gzFile gzin = futil_gzopen(in_path, "r"); // Read call file header cJSON *json = json_hdr_load(gzin, in_path); // Check we can handle the kmer size kmer_size = json_hdr_get_kmer_size(json, in_path); db_graph_check_kmer_size(kmer_size, in_path); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(in_path), isbubble ? "bubble" : "breakpoint"); if(isbubble) { // bubble specific if(sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ; } else { // breakpoint specific if(min_mapq >= 0) cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls"); } // Open flank file if it exists htsFile *samfh = NULL; bam_hdr_t *bam_hdr = NULL; bam1_t *mflank = NULL; if(sam_path) { if((samfh = hts_open(sam_path, "r")) == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_hdr = sam_hdr_read(samfh); if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path); mflank = bam_init1(); } // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, in_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path); num_samples = 0; if(!isbubble) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } // // Open output file // if(!out_path) out_path = "-"; int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *vcffh = hts_open(out_path, modes_htslib[mode]); status("[calls2vcf] Reading %s call file with %zu samples", isbubble ? "Bubble" : "Breakpoint", num_graph_samples); status("[calls2vcf] %zu sample output to: %s format: %s", num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]); if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq); status("[calls2vcf] max alignment length: %i", max_align_len); status("[calls2vcf] max VCF allele length: %i", max_allele_len); status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i", nwmatch, nwmismatch, nwgapopen, nwgapextend); // Load reference genome read_buf_alloc(&chroms, 1024); genome = chrom_hash_init(); chrom_hash_load(ref_paths, nref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!isbubble) brkpnt_check_refs_match(json, genome, in_path); bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size, ref_paths, nref_paths, chroms.b, chroms.len); if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header"); AlignedCall *call = acall_init(); CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr); scoring_t *scoring = call_decomp_get_scoring(aligner); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, false, false, 0, 0, 0, 0); CallFileEntry centry; call_file_entry_alloc(¢ry); char kmer_str[50]; sprintf(kmer_str, ";K%zu", kmer_size); if(isbubble) { // Bubble calls DecompBubble *bubbles = decomp_bubble_init(); // Set scoring for aligning 3' flank scoring = decomp_bubble_get_scoring(bubbles); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, true, true, 0, 0, 0, 0); while(call_file_read(gzin, in_path, ¢ry)) { do { if(sam_read1(samfh, bam_hdr, mflank) < 0) die("We've run out of SAM entries!"); } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); // Align call strbuf_reset(&call->info); decomp_bubble_call(bubbles, genome, kmer_size, min_mapq, ¢ry, mflank, bam_hdr, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats)); decomp_bubble_cpy_stats(bub_stats, bubbles); print_bubble_stats(bub_stats); ctx_free(bub_stats); decomp_bubble_destroy(bubbles); } else { // Breakpoint calls DecompBreakpoint *breakpoints = decomp_brkpt_init(); while(call_file_read(gzin, in_path, ¢ry)) { strbuf_reset(&call->info); decomp_brkpt_call(breakpoints, genome, num_samples, ¢ry, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats)); decomp_brkpt_cpy_stats(brk_stats, breakpoints); print_breakpoint_stats(brk_stats); ctx_free(brk_stats); decomp_brkpt_destroy(breakpoints); } // Print stats DecomposeStats *astats = ctx_calloc(1, sizeof(*astats)); call_decomp_cpy_stats(astats, aligner); print_acall_stats(astats); ctx_free(astats); call_file_entry_dealloc(¢ry); call_decomp_destroy(aligner); acall_destroy(call); // Finished - clean up cJSON_Delete(json); gzclose(gzin); bcf_hdr_destroy(vcfhdr); hts_close(vcffh); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); chrom_hash_destroy(genome); if(sam_path) { hts_close(samfh); bam_hdr_destroy(bam_hdr); bam_destroy1(mflank); } return EXIT_SUCCESS; }
int ctx_calls2vcf(int argc, char **argv) { parse_cmdline_args(argc, argv); size_t i; // These functions call die() on error gzFile gzin = futil_gzopen(input_path, "r"); nw_aligner_setup(); // Read file header cJSON *json = read_input_header(gzin); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, input_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) input_bubble_format = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) input_bubble_format = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(input_path), input_bubble_format ? "bubble" : "breakpoint"); if(input_bubble_format && sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); // Open flank file if it exists if(sam_path) flanks_sam_open(); // Open output file FILE *fout = futil_fopen_create(out_path, "w"); // Load reference genome read_buf_alloc(&chroms, 1024); genome = kh_init(ChromHash); seq_reader_load_ref_genome(ref_paths, num_ref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!input_bubble_format) brkpnt_check_refs_match(json, input_path); // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, input_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, input_path); num_samples = 0; if(!input_bubble_format) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } print_vcf_header(json, !input_bubble_format, fout); status("Reading %s call file with %zu samples", input_bubble_format ? "Bubble" : "Breakpoint", num_graph_samples); status("Writing a VCF with %zu samples", num_samples); parse_entries(gzin, fout); // Print stats char num_entries_read_str[50]; char num_vars_printed_str[50]; ulong_to_str(num_entries_read, num_entries_read_str); ulong_to_str(num_vars_printed, num_vars_printed_str); status("Read %s entries, printed %s vcf entries to: %s", num_entries_read_str, num_vars_printed_str, futil_outpath_str(out_path)); if(input_bubble_format) { char msg[200]; // Bubble caller specific print_stat(num_flank5p_unmapped, num_entries_read, "flank 5p unmapped"); sprintf(msg, "flank 5p low mapq (<%zu)", min_mapq); print_stat(num_flank5p_lowqual, num_entries_read, msg); print_stat(num_flank3p_not_found, num_entries_read, "flank 3p not found"); print_stat(num_flank3p_multihits, num_entries_read, "flank 3p multiple hits"); print_stat(num_flank3p_approx_match,num_entries_read, "flank 3p approx match used"); print_stat(num_flank3p_exact_match, num_entries_read, "flank 3p exact match"); } else { // Breakpoint caller specific print_stat(num_flanks_not_uniquely_mapped, num_entries_read, "flank pairs contain one flank not mapped uniquely"); print_stat(num_flanks_diff_chroms, num_entries_read, "flank pairs map to diff chroms"); print_stat(num_flanks_diff_strands, num_entries_read, "flank pairs map to diff strands"); } print_stat(num_flanks_too_far_apart, num_entries_read, "flank pairs too far apart"); print_stat(num_flanks_overlap_too_large, num_entries_read, "flank pairs overlap too much"); print_stat(num_entries_well_mapped, num_entries_read, "flank pairs map well"); status("Aligned %zu allele pairs and %zu flanks", num_nw_allele, num_nw_flank); // Finished - clean up cJSON_Delete(json); gzclose(gzin); fclose(fout); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); kh_destroy_ChromHash(genome); nw_aligner_destroy(); if(sam_path) flanks_sam_close(); // hide unused method warnings (void)kh_del_ChromHash; (void)kh_put_ChromHash; (void)kh_get_ChromHash; (void)kh_clear_ChromHash; (void)kh_destroy_ChromHash; (void)kh_init_ChromHash; return EXIT_SUCCESS; }