// Check contig entries match reference // We check that these match the reference just loaded static void brkpnt_check_refs_match(cJSON *json, const ChromHash *genome, const char *path) { cJSON *version = json_hdr_get(json, "format_version", cJSON_Number, path); if(version->valueint <= 2) return; cJSON *command = json_hdr_get_curr_cmd(json, path); cJSON *brkpnts = json_hdr_get(command, "breakpoints", cJSON_Object, path); cJSON *contigs = json_hdr_get(brkpnts, "contigs", cJSON_Array, path); cJSON *contig; size_t num_chroms = 0; for(contig = contigs->child; contig; contig = contig->next, num_chroms++) { cJSON *id = json_hdr_get(contig, "id", cJSON_String, path); cJSON *len = json_hdr_get(contig, "length", cJSON_Number, path); // Check chrom is loaded in ref and of expected length khiter_t k = kh_get(kChromHash, genome, id->valuestring); if(k == kh_end(genome)) warn("Cannot find chrom [%s]", id->valuestring); else { const read_t *r = kh_value(genome, k); if(r->seq.end != (size_t)len->valueint) { warn("Chrom lengths do not match %s input:%li ref:%zu", id->valuestring, len->valueint, r->seq.end); } } } if(num_chroms != kh_size(genome)) { warn("Number of chromosomes differ: %zu in header vs %zu in ref", num_chroms, (size_t)kh_size(genome)); } }
/**
 * Append one "##mccortex_<key>=<...>" VCF header line to `hdr` for each
 * command in the linked list starting at `command`.
 *
 * Each line has the form
 *   ##mccortex_KEY=<prev="A;B",cmd="arg0 arg1 ...",cwd="DIR"[,version="V"]>
 * where prev is the text "NULL" when the command's "prev" array is empty and
 * the version field is only written when the optional "mccortex" entry is
 * present.
 *
 * Every element of the "prev" and "cmd" arrays must be a JSON string; the
 * function calls die() otherwise, rather than passing a NULL valuestring to
 * strbuf_append_str().
 *
 * @param command  first command object to print (may be NULL: prints nothing)
 * @param hdr      string buffer the header text is appended to
 * @param path     input path, handed to the json_hdr_* lookups
 */
void vcf_hdrtxt_append_commands(cJSON *command, StrBuf *hdr, const char *path)
{
  for(; command != NULL; command = command->next)
  {
    cJSON *key  = json_hdr_get(command, "key",  cJSON_String, path);
    cJSON *cmd  = json_hdr_get(command, "cmd",  cJSON_Array,  path);
    cJSON *cwd  = json_hdr_get(command, "cwd",  cJSON_String, path);
    cJSON *prev = json_hdr_get(command, "prev", cJSON_Array,  path);
    cJSON *ver  = json_hdr_try(command, "mccortex", cJSON_String, path);
    cJSON *item;

    // Validate all array elements up front: valuestring is only meaningful
    // for string items, so anything else must be rejected before it is used
    for(item = prev->child; item != NULL; item = item->next)
      if(item->type != cJSON_String) die("Invalid 'prev' field");

    for(item = cmd->child; item != NULL; item = item->next)
      if(item->type != cJSON_String) die("Invalid 'cmd' field");

    strbuf_append_str(hdr, "##mccortex_");
    strbuf_append_str(hdr, key->valuestring);
    strbuf_append_str(hdr, "=<prev=\"");

    // Keys of the previous commands, ';' separated; "NULL" if there are none
    item = prev->child;
    if(item == NULL) {
      strbuf_append_str(hdr, "NULL");
    } else {
      strbuf_append_str(hdr, item->valuestring);
      for(item = item->next; item != NULL; item = item->next) {
        strbuf_append_str(hdr, ";");
        strbuf_append_str(hdr, item->valuestring);
      }
    }

    // Command line arguments, space separated
    strbuf_append_str(hdr, "\",cmd=\"");
    for(item = cmd->child; item != NULL; item = item->next) {
      if(item != cmd->child) strbuf_append_char(hdr, ' ');
      strbuf_append_str(hdr, item->valuestring);
    }

    strbuf_append_str(hdr, "\",cwd=\"");
    strbuf_append_str(hdr, cwd->valuestring);
    strbuf_append_str(hdr, "\"");

    if(ver) {
      strbuf_append_str(hdr, ",version=\"");
      strbuf_append_str(hdr, ver->valuestring);
      strbuf_append_str(hdr, "\"");
    }

    strbuf_append_str(hdr, ">\n");
  }
}
// Check contig entries match reference // We check that these match the reference just loaded static void brkpnt_check_refs_match(cJSON *json, const char *path) { cJSON *version = json_hdr_get(json, "format_version", cJSON_Number, path); if(version->valueint <= 2) return; cJSON *command = json_hdr_get_curr_cmd(json, path); cJSON *brkpnts = json_hdr_get(command, "breakpoints", cJSON_Object, path); cJSON *contigs = json_hdr_get(brkpnts, "contigs", cJSON_Array, path); cJSON *contig; size_t num_chroms = 0; for(contig = contigs->child; contig; contig = contig->next, num_chroms++) { cJSON *id = json_hdr_get(contig, "id", cJSON_String, path); cJSON *len = json_hdr_get(contig, "length", cJSON_Number, path); const char *chrom_name = id->valuestring; long chrom_len = len->valueint; size_t reflen; khiter_t k = kh_get(ChromHash, genome, chrom_name); if(k == kh_end(genome)) die("Cannot find ref chrom: %s", chrom_name); else { reflen = kh_value(genome, k)->seq.end; if(reflen != (size_t)chrom_len) { die("Chrom lengths do not match %s input:%li ref:%zu", chrom_name, chrom_len, reflen); } } } if(num_chroms != chroms.len) { die("Number of chromosomes differ: %zu in header vs %zu in ref", num_chroms, chroms.len); } }
int ctx_calls2vcf(int argc, char **argv) { const char *in_path = NULL, *out_path = NULL, *out_type = NULL; // Filtering parameters int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1; // Alignment parameters int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1; // ref paths char const*const* ref_paths = NULL; size_t nref_paths = 0; // flank file const char *sam_path = NULL; // // Things we figure out by looking at the input // bool isbubble = false; // samples in VCF, (0 for bubble, does not include ref in breakpoint calls) size_t i, kmer_size, num_samples; // // Reference genome // // Hash map of chromosome name -> sequence ChromHash *genome; ReadBuffer chroms; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len < 0,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" "SUBCMD" -h` for help. 
Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(max_align_len < 0) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); in_path = argv[optind++]; ref_paths = (char const*const*)argv + optind; nref_paths = argc - optind; // These functions call die() on error gzFile gzin = futil_gzopen(in_path, "r"); // Read call file header cJSON *json = json_hdr_load(gzin, in_path); // Check we can handle the kmer size kmer_size = json_hdr_get_kmer_size(json, in_path); db_graph_check_kmer_size(kmer_size, in_path); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(in_path), isbubble ? 
"bubble" : "breakpoint"); if(isbubble) { // bubble specific if(sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ; } else { // breakpoint specific if(min_mapq >= 0) cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls"); } // Open flank file if it exists htsFile *samfh = NULL; bam_hdr_t *bam_hdr = NULL; bam1_t *mflank = NULL; if(sam_path) { if((samfh = hts_open(sam_path, "r")) == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_hdr = sam_hdr_read(samfh); if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path); mflank = bam_init1(); } // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, in_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path); num_samples = 0; if(!isbubble) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } // // Open output file // if(!out_path) out_path = "-"; int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *vcffh = hts_open(out_path, modes_htslib[mode]); status("[calls2vcf] Reading %s call file with %zu samples", isbubble ? "Bubble" : "Breakpoint", num_graph_samples); status("[calls2vcf] %zu sample output to: %s format: %s", num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]); if(isbubble) status("[calls2vcf] min. 
MAPQ: %i", min_mapq); status("[calls2vcf] max alignment length: %i", max_align_len); status("[calls2vcf] max VCF allele length: %i", max_allele_len); status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i", nwmatch, nwmismatch, nwgapopen, nwgapextend); // Load reference genome read_buf_alloc(&chroms, 1024); genome = chrom_hash_init(); chrom_hash_load(ref_paths, nref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!isbubble) brkpnt_check_refs_match(json, genome, in_path); bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size, ref_paths, nref_paths, chroms.b, chroms.len); if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header"); AlignedCall *call = acall_init(); CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr); scoring_t *scoring = call_decomp_get_scoring(aligner); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, false, false, 0, 0, 0, 0); CallFileEntry centry; call_file_entry_alloc(¢ry); char kmer_str[50]; sprintf(kmer_str, ";K%zu", kmer_size); if(isbubble) { // Bubble calls DecompBubble *bubbles = decomp_bubble_init(); // Set scoring for aligning 3' flank scoring = decomp_bubble_get_scoring(bubbles); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, true, true, 0, 0, 0, 0); while(call_file_read(gzin, in_path, ¢ry)) { do { if(sam_read1(samfh, bam_hdr, mflank) < 0) die("We've run out of SAM entries!"); } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); // Align call strbuf_reset(&call->info); decomp_bubble_call(bubbles, genome, kmer_size, min_mapq, ¢ry, mflank, bam_hdr, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats)); decomp_bubble_cpy_stats(bub_stats, bubbles); print_bubble_stats(bub_stats); ctx_free(bub_stats); 
decomp_bubble_destroy(bubbles); } else { // Breakpoint calls DecompBreakpoint *breakpoints = decomp_brkpt_init(); while(call_file_read(gzin, in_path, ¢ry)) { strbuf_reset(&call->info); decomp_brkpt_call(breakpoints, genome, num_samples, ¢ry, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats)); decomp_brkpt_cpy_stats(brk_stats, breakpoints); print_breakpoint_stats(brk_stats); ctx_free(brk_stats); decomp_brkpt_destroy(breakpoints); } // Print stats DecomposeStats *astats = ctx_calloc(1, sizeof(*astats)); call_decomp_cpy_stats(astats, aligner); print_acall_stats(astats); ctx_free(astats); call_file_entry_dealloc(¢ry); call_decomp_destroy(aligner); acall_destroy(call); // Finished - clean up cJSON_Delete(json); gzclose(gzin); bcf_hdr_destroy(vcfhdr); hts_close(vcffh); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); chrom_hash_destroy(genome); if(sam_path) { hts_close(samfh); bam_hdr_destroy(bam_hdr); bam_destroy1(mflank); } return EXIT_SUCCESS; }
/**
 * Build the VCF header for a call file.
 *
 * The header text is assembled in a string buffer and then parsed with
 * bcf_hdr_parse(). It contains, in order: file format and date, a
 * "##mccortex_" line for this command followed by one for each command in the
 * input header, INFO/FORMAT/FILTER definitions, the reference paths, one
 * "##contig" line per loaded chromosome and the column header line. For
 * breakpoint calls the column header gets one column per colour for which
 * json_hdr_colour_is_ref() is false.
 *
 * @param json           parsed call file header (must not be NULL)
 * @param in_path        input path, handed to the json_hdr_* lookups
 * @param is_breakpoint  true for breakpoint calls, false for bubble calls
 * @param kmer_size      kmer size used to name the K<k> INFO flag
 * @param ref_paths      reference file paths; ref_paths[0] is always read
 * @param nref_paths     number of entries in ref_paths
 * @param chroms         loaded chromosomes
 * @param nchroms        number of entries in chroms
 * @return newly created header; calls die() if the text cannot be parsed
 */
static bcf_hdr_t* make_vcf_hdr(cJSON *json, const char *in_path,
                               bool is_breakpoint, size_t kmer_size,
                               char const*const* ref_paths, size_t nref_paths,
                               read_t *chroms, size_t nchroms)
{
  ctx_assert(json != NULL);

  StrBuf text;
  strbuf_alloc(&text, 1024);

  // File date as YYYYMMDD in local time
  char today[9];
  time_t now = time(NULL);
  strftime(today, sizeof(today), "%Y%m%d", localtime(&now));

  strbuf_append_str(&text, "##fileformat=VCFv4.2\n##fileDate=");
  strbuf_append_str(&text, today);
  strbuf_append_str(&text, "\n");

  // Commands recorded in the input header
  cJSON *cmd_list = json_hdr_get(json, "commands", cJSON_Array, in_path);
  cJSON *first_cmd = cmd_list->child;

  // Key of the first recorded command becomes our "prev"
  const char *parent_key = NULL;
  if(first_cmd != NULL)
  {
    cJSON *key_json = json_hdr_get(first_cmd, "key", cJSON_String, in_path);
    parent_key = key_json->valuestring;
  }
  if(parent_key == NULL) parent_key = "NULL";

  // Entry for the command currently running, with a random hex key
  char rand_key[8];
  strbuf_append_str(&text, "##mccortex_");
  strbuf_append_str(&text, hex_rand_str(rand_key, sizeof(rand_key)));
  strbuf_append_str(&text, "=<prev=\"");
  strbuf_append_str(&text, parent_key);
  strbuf_append_str(&text, "\",cmd=\"");
  strbuf_append_str(&text, cmd_get_cmdline());
  strbuf_append_str(&text, "\",cwd=\"");
  strbuf_append_str(&text, cmd_get_cwd());
  strbuf_append_str(&text, "\",version="CTX_VERSION">\n");

  // Entries for the commands that came before
  vcf_hdrtxt_append_commands(first_cmd, &text, in_path);

  // Field definitions
  const char *call_info = is_breakpoint
    ? "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n"
    : "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n";
  strbuf_append_str(&text, call_info);

  strbuf_sprintf(&text, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size);
  strbuf_append_str(&text, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
  strbuf_append_str(&text, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");

  // Reference paths, comma separated
  strbuf_append_str(&text, "##reference=");
  strbuf_append_str(&text, ref_paths[0]);

  size_t idx = 1;
  while(idx < nref_paths)
  {
    strbuf_append_char(&text, ',');
    strbuf_append_str(&text, ref_paths[idx]);
    idx++;
  }
  strbuf_append_str(&text, "\n");

  // Contig names and lengths
  idx = 0;
  while(idx < nchroms)
  {
    strbuf_sprintf(&text, "##contig=<ID=%s,length=%zu>\n",
                   chroms[idx].name.b, chroms[idx].seq.end);
    idx++;
  }

  // Column header line
  strbuf_append_str(&text, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");

  if(is_breakpoint)
  {
    // One column for each colour that is not flagged as the reference
    cJSON *graph_json = json_hdr_get(json, "graph", cJSON_Object, in_path);
    cJSON *col_list = json_hdr_get(graph_json, "colours", cJSON_Array, in_path);
    cJSON *col = col_list->child;

    if(col == NULL) die("Missing colours");

    for(; col != NULL; col = col->next)
    {
      if(json_hdr_colour_is_ref(col)) continue;

      cJSON *sample_json = json_hdr_get(col, "sample", cJSON_String, in_path);
      strbuf_append_str(&text, "\t");
      strbuf_append_str(&text, sample_json->valuestring);
    }
  }

  strbuf_append_char(&text, '\n');

  bcf_hdr_t *vcf_hdr = bcf_hdr_init("w");
  if(bcf_hdr_parse(vcf_hdr, text.b) != 0) die("Cannot construct VCF header");

  strbuf_dealloc(&text);
  return vcf_hdr;
}
/**
 * Write the VCF header for the current call file to `fout`.
 *
 * Output, in order: file format and date, a "##mccortex_" line for this
 * command followed by one for each command in the input header,
 * INFO/FORMAT/FILTER definitions, the reference paths, one "##contig" line
 * per loaded chromosome and the column header line. For breakpoint calls the
 * column header gets one column per colour for which
 * json_hdr_colour_is_ref() is false.
 *
 * Reads the file-scope variables input_path, kmer_size, ref_paths,
 * num_ref_paths and chroms.
 *
 * @param json           parsed call file header (must not be NULL)
 * @param is_breakpoint  true for breakpoint calls, false for bubble calls
 * @param fout           stream the header is written to
 */
static void print_vcf_header(cJSON *json, bool is_breakpoint, FILE *fout)
{
  ctx_assert(json != NULL);

  // File date as YYYYMMDD in local time
  char datestr[9];
  time_t date = time(NULL);
  strftime(datestr, 9, "%Y%m%d", localtime(&date));

  fprintf(fout, "##fileformat=VCFv4.1\n##fileDate=%s\n", datestr);

  // Print commands used to generate header
  cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, input_path);
  cJSON *command = commands->child;

  // Print this command
  char keystr[8];
  char *prevstr = NULL;
  size_t i;

  if(command) {
    cJSON *key = json_hdr_get(command, "key", cJSON_String, input_path);
    prevstr = key->valuestring;
  }

  // Print command entry for this command
  fprintf(fout, "##mccortex_%s=<prev=\"%s\",cmd=\"%s\",cwd=\"%s\",version="CTX_VERSION">\n",
          hex_rand_str(keystr, sizeof(keystr)),
          prevstr ? prevstr : "NULL",
          cmd_get_cmdline(), cmd_get_cwd());

  // Print previous commands
  for(; command != NULL; command = command->next)
  {
    cJSON *key  = json_hdr_get(command, "key",  cJSON_String, input_path);
    cJSON *cmd  = json_hdr_get(command, "cmd",  cJSON_Array,  input_path);
    cJSON *cwd  = json_hdr_get(command, "cwd",  cJSON_String, input_path);
    cJSON *prev = json_hdr_get(command, "prev", cJSON_Array,  input_path);
    cJSON *ver  = json_hdr_try(command, "mccortex",cJSON_String, input_path);

    prev = prev->child; // result could be NULL
    if(prev && prev->type != cJSON_String) die("Invalid 'prev' field");

    fprintf(fout, "##mccortex_%s=<prev=\"%s", key->valuestring,
            prev ? prev->valuestring : "NULL");

    if(prev) {
      while((prev = prev->next) != NULL)
        fprintf(fout, ";%s", prev->valuestring);
    }

    fprintf(fout, "\",cmd=\"");
    for(i = 0, cmd = cmd->child; cmd; cmd = cmd->next, i++) {
      if(i > 0) fputc(' ', fout);
      fputs(cmd->valuestring, fout);
    }

    fprintf(fout, "\",cwd=\"%s\"", cwd->valuestring);
    if(ver) {
      fprintf(fout, ",version=\"%s\"", ver->valuestring);
    }
    fprintf(fout, ">\n");
  }

  // Print field definitions
  if(is_breakpoint)
    fprintf(fout, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n");
  else
    fprintf(fout, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n");

  fprintf(fout, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size);
  fprintf(fout, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n");
  fprintf(fout, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n");

  // Print reference paths
  // All paths must go to fout: writing the extra ones with printf() sent them
  // to stdout and left the ##reference line in the output file incomplete
  fprintf(fout, "##reference=%s", ref_paths[0]);
  for(i = 1; i < num_ref_paths; i++) fprintf(fout, ",%s", ref_paths[i]);
  fprintf(fout, "\n");

  // Print contigs lengths
  for(i = 0; i < chroms.len; i++) {
    fprintf(fout, "##contig=<ID=%s,length=%zu>\n",
            chroms.b[i].name.b, chroms.b[i].seq.end);
  }

  // Print VCF column header
  fputs("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", fout);

  if(is_breakpoint)
  {
    // Print a column for each sample
    cJSON *graph_json   = json_hdr_get(json,       "graph",   cJSON_Object, input_path);
    cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array,  input_path);
    cJSON *colour_json  = colours_json->child;
    if(colour_json == NULL) die("Missing colours");

    for(; colour_json; colour_json = colour_json->next)
    {
      if(!json_hdr_colour_is_ref(colour_json)) {
        cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, input_path);
        fputc('\t', fout);
        fputs(sample_json->valuestring, fout);
      }
    }
  }

  fputc('\n', fout);
}
int ctx_calls2vcf(int argc, char **argv) { parse_cmdline_args(argc, argv); size_t i; // These functions call die() on error gzFile gzin = futil_gzopen(input_path, "r"); nw_aligner_setup(); // Read file header cJSON *json = read_input_header(gzin); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, input_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) input_bubble_format = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) input_bubble_format = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(input_path), input_bubble_format ? "bubble" : "breakpoint"); if(input_bubble_format && sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); // Open flank file if it exists if(sam_path) flanks_sam_open(); // Open output file FILE *fout = futil_fopen_create(out_path, "w"); // Load reference genome read_buf_alloc(&chroms, 1024); genome = kh_init(ChromHash); seq_reader_load_ref_genome(ref_paths, num_ref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!input_bubble_format) brkpnt_check_refs_match(json, input_path); // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, input_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, input_path); num_samples = 0; if(!input_bubble_format) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } print_vcf_header(json, !input_bubble_format, fout); status("Reading %s call file with %zu samples", input_bubble_format ? 
"Bubble" : "Breakpoint", num_graph_samples); status("Writing a VCF with %zu samples", num_samples); parse_entries(gzin, fout); // Print stats char num_entries_read_str[50]; char num_vars_printed_str[50]; ulong_to_str(num_entries_read, num_entries_read_str); ulong_to_str(num_vars_printed, num_vars_printed_str); status("Read %s entries, printed %s vcf entries to: %s", num_entries_read_str, num_vars_printed_str, futil_outpath_str(out_path)); if(input_bubble_format) { char msg[200]; // Bubble caller specific print_stat(num_flank5p_unmapped, num_entries_read, "flank 5p unmapped"); sprintf(msg, "flank 5p low mapq (<%zu)", min_mapq); print_stat(num_flank5p_lowqual, num_entries_read, msg); print_stat(num_flank3p_not_found, num_entries_read, "flank 3p not found"); print_stat(num_flank3p_multihits, num_entries_read, "flank 3p multiple hits"); print_stat(num_flank3p_approx_match,num_entries_read, "flank 3p approx match used"); print_stat(num_flank3p_exact_match, num_entries_read, "flank 3p exact match"); } else { // Breakpoint caller specific print_stat(num_flanks_not_uniquely_mapped, num_entries_read, "flank pairs contain one flank not mapped uniquely"); print_stat(num_flanks_diff_chroms, num_entries_read, "flank pairs map to diff chroms"); print_stat(num_flanks_diff_strands, num_entries_read, "flank pairs map to diff strands"); } print_stat(num_flanks_too_far_apart, num_entries_read, "flank pairs too far apart"); print_stat(num_flanks_overlap_too_large, num_entries_read, "flank pairs overlap too much"); print_stat(num_entries_well_mapped, num_entries_read, "flank pairs map well"); status("Aligned %zu allele pairs and %zu flanks", num_nw_allele, num_nw_flank); // Finished - clean up cJSON_Delete(json); gzclose(gzin); fclose(fout); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); kh_destroy_ChromHash(genome); nw_aligner_destroy(); if(sam_path) flanks_sam_close(); // hide unused method warnings (void)kh_del_ChromHash; 
(void)kh_put_ChromHash; (void)kh_get_ChromHash; (void)kh_clear_ChromHash; (void)kh_destroy_ChromHash; (void)kh_init_ChromHash; return EXIT_SUCCESS; }
int ctx_links(int argc, char **argv) { size_t limit = 0; const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL; const char *thresh_path = NULL, *hist_path = NULL; size_t hist_distsize = 0, hist_covgsize = 0; size_t cutoff = 0; bool clean = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break; case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break; case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break; case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break; case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break; case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break; case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break; case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" links -h` for help. 
Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist"); if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist"); // Defaults if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST; if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG; if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments"); const char *ctp_path = argv[optind]; bool list = (csv_out_path != NULL); bool plot = (plot_out_path != NULL); bool save = (link_out_path != NULL); bool hist_covg = (thresh_path != NULL || hist_path != NULL); size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1); if(clean && !save) cmd_print_usage("Need to give --out <out.ctp.gz> with --clean"); if(!save && !list && !plot && !hist_covg) cmd_print_usage("Please specify one of --plot, --list or --clean"); if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0) cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!"); // Open input file FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL; FILE *thresh_fh = NULL, *hist_fh = NULL; gzFile link_gz = NULL; // Check file don't exist or that we can overwrite // Will ignore if path is null bool err = false; err |= futil_check_outfile(csv_out_path); err |= futil_check_outfile(plot_out_path); err |= futil_check_outfile(link_out_path); err |= futil_check_outfile(thresh_path); err |= futil_check_outfile(hist_path); if(err) die("Use -f,--force to overwrite files"); StrBuf link_tmp_path; strbuf_alloc(&link_tmp_path, 1024); GPathReader ctpin; memset(&ctpin, 0, sizeof(ctpin)); gpath_reader_open(&ctpin, ctp_path); size_t ncols = file_filter_into_ncols(&ctpin.fltr); size_t kmer_size = gpath_reader_get_kmer_size(&ctpin); cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1); if(ncols != 1) die("Can only clean a single colour at a time. 
Sorry."); uint64_t (*hists)[hist_covgsize] = NULL; if(hist_covg) { hists = ctx_calloc(hist_distsize, sizeof(hists[0])); } if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL) die("Cannot open file: %s", hist_path); if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL) die("Cannot open file: %s", thresh_path); if(limit) status("Limiting to the first %zu kmers", limit); if(clean) { timestamp(); message(" Cleaning coverage below %zu", cutoff); message("\n"); } if(save) { // Check we can find the fields we need cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); if(!nkmers_json || !nlinks_json || !nbytes_json) die("Cannot find required header entries"); // Create a random temporary file link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path); status("Saving output to: %s", link_out_path); status("Temporary output: %s", link_tmp_path.b); // Open output file if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL) die("Cannot open output link file: %s", link_out_path); // Need to open output file first so we can get absolute path // Update the header to include this command json_hdr_add_curr_cmd(newhdr, link_out_path); } if(list) { status("Listing to %s", csv_out_path); if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL) die("Cannot open output CSV file %s", csv_out_path); // Print csv header fprintf(list_fh, "SeqLen,Covg\n"); } if(plot) { status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path); if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL) die("Cannot open output .dot file %s", plot_out_path); } SizeBuffer countbuf, jposbuf; size_buf_alloc(&countbuf, 16); size_buf_alloc(&jposbuf, 
1024); StrBuf kmerbuf, juncsbuf, seqbuf, outbuf; strbuf_alloc(&kmerbuf, 1024); strbuf_alloc(&juncsbuf, 1024); strbuf_alloc(&seqbuf, 1024); strbuf_alloc(&outbuf, 1024); bool link_fw; size_t njuncs; size_t knum, nlinks, num_links_exp = 0; LinkTree ltree; ltree_alloc(<ree, kmer_size); LinkTreeStats tree_stats; memset(&tree_stats, 0, sizeof(tree_stats)); size_t init_num_links = 0, num_links = 0; for(knum = 0; !limit || knum < limit; knum++) { ltree_reset(<ree); if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break; ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu", kmerbuf.end, kmer_size); // status("kmer: %s", kmerbuf.b); for(nlinks = 0; gpath_reader_read_link(&ctpin, &link_fw, &njuncs, &countbuf, &juncsbuf, &seqbuf, &jposbuf); nlinks++) { ltree_add(<ree, link_fw, countbuf.b[0], jposbuf.b, juncsbuf.b, seqbuf.b); } if(nlinks != num_links_exp) warn("Links count mismatch %zu != %zu", nlinks, num_links_exp); if(hist_covg) { ltree_update_covg_hists(<ree, (uint64_t*)hists, hist_distsize, hist_covgsize); } if(clean) { ltree_clean(<ree, cutoff); } // Accumulate statistics ltree_get_stats(<ree, &tree_stats); num_links = tree_stats.num_links - init_num_links; init_num_links = tree_stats.num_links; if(list) { ltree_write_list(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end) die("Cannot write CSV file to: %s", csv_out_path); strbuf_reset(&outbuf); } if(save && num_links) { ltree_write_ctp(<ree, kmerbuf.b, num_links, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end) die("Cannot write ctp file to: %s", link_tmp_path.b); strbuf_reset(&outbuf); } if(plot && knum == plot_kmer_idx) { status("Plotting tree..."); ltree_write_dot(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end) die("Cannot write plot DOT file to: %s", plot_out_path); strbuf_reset(&outbuf); } } gpath_reader_close(&ctpin); cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON 
*nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links); status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links); status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes); if(save) { // Update JSON nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links; nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links; nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes; char *json_str = cJSON_Print(newhdr); if(gzputs(link_gz, json_str) != (int)strlen(json_str)) die("Cannot write ctp file to: %s", link_out_path); free(json_str); gzputs(link_gz, "\n\n"); gzputs(link_gz, ctp_explanation_comment); gzputs(link_gz, "\n"); fseek(link_tmp_fh, 0, SEEK_SET); char *tmp = ctx_malloc(4*ONE_MEGABYTE); size_t s; while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) { if(gzwrite(link_gz, tmp, s) != (int)s) die("Cannot write to output: %s", link_out_path); } ctx_free(tmp); gzclose(link_gz); fclose(link_tmp_fh); } // Write histogram to file if(hist_fh) { size_t i, j; fprintf(hist_fh, " "); for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j); fprintf(hist_fh, "\n"); for(i = 1; i < hist_distsize; i++) { fprintf(hist_fh, "dist.%02zu", i); for(j = 1; j < hist_covgsize; j++) { fprintf(hist_fh, ",%"PRIu64, hists[i][j]); } fprintf(hist_fh, "\n"); } } if(thresh_fh) { // Use median of first five cutoffs print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh); } if(hist_fh && hist_fh != stdout) fclose(hist_fh); if(list) { fclose(list_fh); } if(plot) { fclose(plot_fh); } ctx_free(hists); cJSON_Delete(newhdr); strbuf_dealloc(&link_tmp_path); 
ltree_dealloc(<ree); size_buf_dealloc(&countbuf); size_buf_dealloc(&jposbuf); strbuf_dealloc(&kmerbuf); strbuf_dealloc(&juncsbuf); strbuf_dealloc(&seqbuf); strbuf_dealloc(&outbuf); return EXIT_SUCCESS; }