static void parse_cmdline_args(int argc, char **argv) { // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq == SIZE_MAX,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len == SIZE_MAX,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len == SIZE_MAX,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'D': cmd_check(max_path_diff == SIZE_MAX, cmd); max_path_diff = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" calls2vcf -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = default_out_path; if(min_mapq == SIZE_MAX) min_mapq = DEFAULT_MIN_MAPQ; if(max_align_len == SIZE_MAX) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len == SIZE_MAX) max_allele_len = DEFAULT_MAX_ALLELE; if(max_path_diff == SIZE_MAX) max_path_diff = DEFAULT_MAX_PDIFF; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); input_path = argv[optind++]; ref_paths = argv + optind; num_ref_paths = argc - optind; }
int ctx_calls2vcf(int argc, char **argv) { const char *in_path = NULL, *out_path = NULL, *out_type = NULL; // Filtering parameters int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1; // Alignment parameters int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1; // ref paths char const*const* ref_paths = NULL; size_t nref_paths = 0; // flank file const char *sam_path = NULL; // // Things we figure out by looking at the input // bool isbubble = false; // samples in VCF, (0 for bubble, does not include ref in breakpoint calls) size_t i, kmer_size, num_samples; // // Reference genome // // Hash map of chromosome name -> sequence ChromHash *genome; ReadBuffer chroms; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len < 0,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(max_align_len < 0) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); in_path = argv[optind++]; ref_paths = (char const*const*)argv + optind; nref_paths = argc - optind; // These functions call die() on error gzFile gzin = futil_gzopen(in_path, "r"); // Read call file header cJSON *json = json_hdr_load(gzin, in_path); // Check we can handle the kmer size kmer_size = json_hdr_get_kmer_size(json, in_path); db_graph_check_kmer_size(kmer_size, in_path); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(in_path), isbubble ? "bubble" : "breakpoint"); if(isbubble) { // bubble specific if(sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ; } else { // breakpoint specific if(min_mapq >= 0) cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls"); } // Open flank file if it exists htsFile *samfh = NULL; bam_hdr_t *bam_hdr = NULL; bam1_t *mflank = NULL; if(sam_path) { if((samfh = hts_open(sam_path, "r")) == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_hdr = sam_hdr_read(samfh); if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path); mflank = bam_init1(); } // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, in_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path); num_samples = 0; if(!isbubble) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } // // Open output file // if(!out_path) out_path = "-"; int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *vcffh = hts_open(out_path, modes_htslib[mode]); status("[calls2vcf] Reading %s call file with %zu samples", isbubble ? "Bubble" : "Breakpoint", num_graph_samples); status("[calls2vcf] %zu sample output to: %s format: %s", num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]); if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq); status("[calls2vcf] max alignment length: %i", max_align_len); status("[calls2vcf] max VCF allele length: %i", max_allele_len); status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i", nwmatch, nwmismatch, nwgapopen, nwgapextend); // Load reference genome read_buf_alloc(&chroms, 1024); genome = chrom_hash_init(); chrom_hash_load(ref_paths, nref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!isbubble) brkpnt_check_refs_match(json, genome, in_path); bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size, ref_paths, nref_paths, chroms.b, chroms.len); if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header"); AlignedCall *call = acall_init(); CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr); scoring_t *scoring = call_decomp_get_scoring(aligner); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, false, false, 0, 0, 0, 0); CallFileEntry centry; call_file_entry_alloc(¢ry); char kmer_str[50]; sprintf(kmer_str, ";K%zu", kmer_size); if(isbubble) { // Bubble calls DecompBubble *bubbles = decomp_bubble_init(); // Set scoring for aligning 3' flank scoring = decomp_bubble_get_scoring(bubbles); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, true, true, 0, 0, 0, 0); while(call_file_read(gzin, in_path, ¢ry)) { do { if(sam_read1(samfh, bam_hdr, mflank) < 0) die("We've run out of SAM entries!"); } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); // Align call strbuf_reset(&call->info); decomp_bubble_call(bubbles, genome, kmer_size, min_mapq, ¢ry, mflank, bam_hdr, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats)); decomp_bubble_cpy_stats(bub_stats, bubbles); print_bubble_stats(bub_stats); ctx_free(bub_stats); decomp_bubble_destroy(bubbles); } else { // Breakpoint calls DecompBreakpoint *breakpoints = decomp_brkpt_init(); while(call_file_read(gzin, in_path, ¢ry)) { strbuf_reset(&call->info); decomp_brkpt_call(breakpoints, genome, num_samples, ¢ry, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats)); decomp_brkpt_cpy_stats(brk_stats, breakpoints); print_breakpoint_stats(brk_stats); ctx_free(brk_stats); decomp_brkpt_destroy(breakpoints); } // Print stats DecomposeStats *astats = ctx_calloc(1, sizeof(*astats)); call_decomp_cpy_stats(astats, aligner); print_acall_stats(astats); ctx_free(astats); call_file_entry_dealloc(¢ry); call_decomp_destroy(aligner); acall_destroy(call); // Finished - clean up cJSON_Delete(json); gzclose(gzin); bcf_hdr_destroy(vcfhdr); hts_close(vcffh); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); chrom_hash_destroy(genome); if(sam_path) { hts_close(samfh); bam_hdr_destroy(bam_hdr); bam_destroy1(mflank); } return EXIT_SUCCESS; }