// Safe to call on different entries at the same time // NOT safe to do find() whilst doing delete() void hash_table_delete(HashTable *const ht, hkey_t pos) { uint64_t bucket = pos / ht->bucket_size, n, m; ctx_assert(pos != HASH_NOT_FOUND); ctx_assert(HASH_ENTRY_ASSIGNED(ht->table[pos])); memset(ht->table+pos, 0, sizeof(BinaryKmer)); n = __sync_fetch_and_sub((volatile uint64_t *)&ht->num_kmers, 1); m = __sync_fetch_and_sub((volatile uint8_t *)&ht->buckets[bucket][HT_BITEMS], 1); ctx_assert2(n > 0, "Deleted from empty table"); ctx_assert2(m > 0, "Deleted from empty bucket"); ctx_assert(!HASH_ENTRY_ASSIGNED(ht->table[pos])); }
/**
 * Read callback: print a read (or read pair) depending on whether it touches
 * the graph, and update the read counters.
 *
 * A read / pair is printed when `touches_graph != input->invert`. For a pair,
 * the second read is only tested if the first does not touch the graph.
 *
 * NOTE(review): the statistics and the global read counter are updated with
 * atomic builtins, which suggests this callback runs on several threads at
 * once. The printed-reads counter is therefore updated atomically as well;
 * a plain `+=` would lose updates if two threads share the same `input`.
 *
 * @param data  reads to process; data->ptr must point to an AlignReadsData
 * @param arg   unused
 */
void filter_reads(AsyncIOData *data, void *arg)
{
  (void)arg;

  read_t *r1 = (read_t*)&data->r1;
  // A second read is present only if its sequence is non-empty
  read_t *r2 = data->r2.seq.end ? (read_t*)&data->r2 : NULL;

  AlignReadsData *input = (AlignReadsData*)data->ptr;
  const dBGraph *db_graph = input->db_graph;
  LoadingStats *stats = input->stats;

  ctx_assert2(r2 == NULL || input->seqout.is_pe, "Were not expecting r2: %p %i",
              r2, (int)input->seqout.is_pe);

  bool touches_graph = read_touches_graph(r1, db_graph, stats) ||
                       (r2 != NULL && read_touches_graph(r2, db_graph, stats));

  if(touches_graph != input->invert)
  {
    seqout_print(&input->seqout, r1, r2);
    // Atomic to match the other counters updated in this function
    __sync_add_and_fetch(&input->num_of_reads_printed, 1 + (r2 != NULL));
  }

  if(r2 == NULL)
    __sync_add_and_fetch((volatile size_t*)&stats->num_se_reads, 1);
  else
    __sync_add_and_fetch((volatile size_t*)&stats->num_pe_reads, 2);

  // Report progress using the global count of processed entries
  size_t n = __sync_add_and_fetch(&read_counter, 1);
  ctx_update("FilterReads", n);
}
// Using file so can call fseek and don't need to load whole graph static size_t inferedges_on_mmap(const dBGraph *db_graph, bool add_all_edges, GraphFileReader *file) { ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols); ctx_assert(file_filter_is_direct(&file->fltr)); ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead"); ctx_assert(file->num_of_kmers >= 0); ctx_assert(file->file_size >= 0); status("[inferedges] Processing mmap file: %s [hdr: %zu bytes file: %zu bytes]", file_filter_path(&file->fltr), (size_t)file->hdr_size, (size_t)file->file_size); if(fseek(file->fh, 0, SEEK_SET) != 0) die("fseek failed: %s", strerror(errno)); // Open memory mapped file void *mmap_ptr = mmap(NULL, file->file_size, PROT_WRITE, MAP_SHARED, fileno(file->fh), 0); if(mmap_ptr == MAP_FAILED) die("Cannot memory map file: %s [%s]", file->fltr.path.b, strerror(errno)); const size_t ncols = file->hdr.num_of_cols; BinaryKmer bkmer; Edges edges[ncols]; Covg covgs[ncols]; bool updated; size_t i, num_kmers = file->num_of_kmers, num_kmers_edited = 0; size_t filekmersize = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg)) * ncols; char *ptr = (char*)mmap_ptr + file->hdr_size; for(i = 0; i < num_kmers; i++, ptr += filekmersize) { char *fh_covgs = ptr + sizeof(BinaryKmer); char *fh_edges = fh_covgs + sizeof(Covg)*ncols; memcpy(bkmer.b, ptr, sizeof(BinaryKmer)); memcpy(covgs, fh_covgs, ncols * sizeof(Covg)); memcpy(edges, fh_edges, ncols * sizeof(Edges)); updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph) : infer_pop_edges(bkmer, edges, covgs, db_graph)); if(updated) { memcpy(fh_covgs, covgs, ncols * sizeof(Covg)); memcpy(fh_edges, edges, ncols * sizeof(Edges)); num_kmers_edited++; } } if(munmap(mmap_ptr, file->file_size) == -1) die("Cannot release mmap file: %s [%s]", file->fltr.path.b, strerror(errno)); return num_kmers_edited; }
/**
 * Re-orient a read pair so that both mates point the same way (FF or RR).
 *
 * FF and RR pairs are left untouched; for FR the second read is reverse
 * complemented, for RF the first read is.
 *
 * @param r1       first read of the pair (must not be NULL)
 * @param r2       second read of the pair (must not be NULL)
 * @param matedir  current orientation of the pair
 */
void seq_reader_orient_mp_FF_or_RR(read_t *r1, read_t *r2, ReadMateDir matedir)
{
  ctx_assert(r1 != NULL);
  ctx_assert(r2 != NULL);

  // Already oriented the same way: nothing to do
  if(matedir == READPAIR_FF || matedir == READPAIR_RR)
    return;

  if(matedir == READPAIR_FR) {
    seq_read_reverse_complement(r2);
    return;
  }

  if(matedir == READPAIR_RF) {
    seq_read_reverse_complement(r1);
    return;
  }

  // Any other value should be unreachable
  ctx_assert2(0, "Invalid ReadMateDir value: %i", (int)matedir);
}
/**
 * Align each allele line of a call against the reference allele and hand the
 * alignment to align_biallelic().
 *
 * Statistics in dc->stats are updated along the way. Calls without a
 * chromosome, or whose reference allele is longer than `max_line_len`, are
 * counted and skipped. Lines that are too long or identical to the reference
 * (case-insensitive) are counted but not aligned.
 *
 * @param dc              decomposer holding aligner, scoring and stats
 * @param call            call to decompose
 * @param max_line_len    longest ref / alt sequence that will be aligned
 * @param max_allele_len  passed through to align_biallelic()
 */
void acall_decompose(CallDecomp *dc, const AlignedCall *call,
                     size_t max_line_len, size_t max_allele_len)
{
  dc->stats.ncalls++;

  // Unmapped call: nothing to align against
  if(call->chrom == NULL)
    return;

  dc->stats.ncalls_mapped++;

  ctx_assert2(call->start <= call->end, "%u .. %u", call->start, call->end);

  const read_t *chrom = call->chrom;
  const char *ref_seq = chrom->seq.b + call->start;
  const size_t ref_len = call->end - call->start;

  if(ref_len > max_line_len) {
    dc->stats.ncalls_ref_allele_too_long++;
    return; // can't align
  }

  dc->stats.nlines += call->n_lines;

  size_t line;
  for(line = 0; line < call->n_lines; line++)
  {
    const StrBuf *alt = &call->lines[line];
    ctx_assert(strlen(alt->b) == alt->end);

    // Cheap rejections first: too long, or same as the reference
    if(alt->end > max_line_len) {
      dc->stats.nlines_too_long++;
      continue;
    }

    if(ref_len == alt->end && strncasecmp(ref_seq, alt->b, ref_len) == 0) {
      dc->stats.nlines_match_ref++;
      continue;
    }

    // Pairwise align ref vs alt, then decompose the alignment
    needleman_wunsch_align2(ref_seq, alt->b, ref_len, alt->end,
                            dc->scoring, dc->nw_aligner, dc->aln);

    align_biallelic(dc->aln->result_a, dc->aln->result_b, chrom,
                    call->gts + line*call->n_samples, call->n_samples,
                    dc, call, max_allele_len);

    dc->stats.nlines_mapped++;
  }
}
static void parse_cmdline_args(int argc, char **argv) { // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq == SIZE_MAX,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len == SIZE_MAX,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len == SIZE_MAX,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'D': cmd_check(max_path_diff == SIZE_MAX, cmd); max_path_diff = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" calls2vcf -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = default_out_path; if(min_mapq == SIZE_MAX) min_mapq = DEFAULT_MIN_MAPQ; if(max_align_len == SIZE_MAX) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len == SIZE_MAX) max_allele_len = DEFAULT_MAX_ALLELE; if(max_path_diff == SIZE_MAX) max_path_diff = DEFAULT_MAX_PDIFF; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); input_path = argv[optind++]; ref_paths = argv + optind; num_ref_paths = argc - optind; }
/**
 * Remove entries from `src` that are in `dst`, copying over sample counts.
 *
 * Both subsets are sorted first if they are not already. The two sorted lists
 * are then walked in step; whenever a path is found in both, its colours and
 * seen-counts are merged into the `dst` path and the entry is dropped from
 * `src`. Both subsets must have the same number of colours.
 */
void gpath_subset_merge(GPathSubset *dst, GPathSubset *src)
{
  ctx_assert2(dst->gpset->ncols == src->gpset->ncols, "%zu vs %zu",
              dst->gpset->ncols, src->gpset->ncols);

  if(!dst->is_sorted) gpath_subset_sort(dst);
  if(!src->is_sorted) gpath_subset_sort(src);

  const size_t ncols = dst->gpset->ncols;
  const size_t ndst = dst->list.len, nsrc = src->list.len;
  GPath **dpaths = dst->list.b;
  GPath **spaths = src->list.b;

  if(ndst == 0 || nsrc == 0) return;

  // Walk both sorted lists together
  size_t d = 0, s = 0;
  while(d < ndst && s < nsrc)
  {
    const int order = gpath_cmp(dpaths[d], spaths[s]);

    if(order < 0) {
      d++;
      continue;
    }

    if(order == 0) {
      // paths match, steal colours and mark the src entry for removal
      gpath_colset_or_mt(dpaths[d], spaths[s], ncols);
      gpath_set_nseen_sum_mt(dpaths[d], dst->gpset, spaths[s], src->gpset);
      spaths[s] = NULL;
    }

    // order >= 0: move on to the next src path
    s++;
  }

  // Compact src, dropping the entries that were set to NULL
  size_t kept = 0;
  for(s = 0; s < nsrc; s++) {
    if(spaths[s] != NULL)
      spaths[kept++] = spaths[s];
  }

  src->list.len = kept;
}
// Using file so can call fseek and don't need to load whole graph static size_t inferedges_on_file(const dBGraph *db_graph, bool add_all_edges, GraphFileReader *file, FILE *fout) { ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols); ctx_assert(file_filter_is_direct(&file->fltr)); ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead"); ctx_assert(fout != NULL); ctx_assert(fileno(file->fh) != fileno(fout)); status("[inferedges] Processing file: %s", file_filter_path(&file->fltr)); // Print header graph_write_header(fout, &file->hdr); // Read the input file again if(fseek(file->fh, file->hdr_size, SEEK_SET) != 0) die("fseek failed: %s", strerror(errno)); const size_t ncols = file->hdr.num_of_cols; BinaryKmer bkmer; Edges edges[ncols]; Covg covgs[ncols]; size_t num_kmers_edited = 0; bool updated; while(graph_file_read_reset(file, ncols, &bkmer, covgs, edges)) { updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph) : infer_pop_edges(bkmer, edges, covgs, db_graph)); graph_write_kmer(fout, file->hdr.num_of_bitfields, file->hdr.num_of_cols, bkmer, covgs, edges); num_kmers_edited += updated; } return num_kmers_edited; }
enum AssemStopCause graphstep2assem(enum GraphStepStatus step, bool hit_cycle, bool low_step_confid, bool low_cumul_confid) { // There should only be one reason to stop traversal ctx_assert2((!grap_step_status_is_good(step) + !!hit_cycle + !!low_step_confid + !!low_cumul_confid) == 1, "One and only one should be true %i %i %i %i", (int)step, (int)hit_cycle, (int)low_step_confid, (int)low_cumul_confid); if(hit_cycle) return ASSEM_STOP_CYCLE; if(low_step_confid) return ASSEM_STOP_LOW_STEP_CONF; if(low_cumul_confid) return ASSEM_STOP_LOW_CUMUL_CONF; switch(step) { case GRPHWLK_NOCOVG: return ASSEM_STOP_NOCOVG; case GRPHWLK_NOCOLCOVG: return ASSEM_STOP_NOCOLCOVG; case GRPHWLK_NOPATHS: return ASSEM_STOP_NOPATHS; case GRPHWLK_SPLIT_PATHS: return ASSEM_STOP_SPLIT_PATHS; case GRPHWLK_MISSING_PATHS: return ASSEM_STOP_MISSING_PATHS; default: die("Unknown %i", (int)step); } }
int ctx_calls2vcf(int argc, char **argv) { const char *in_path = NULL, *out_path = NULL, *out_type = NULL; // Filtering parameters int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1; // Alignment parameters int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1; // ref paths char const*const* ref_paths = NULL; size_t nref_paths = 0; // flank file const char *sam_path = NULL; // // Things we figure out by looking at the input // bool isbubble = false; // samples in VCF, (0 for bubble, does not include ref in breakpoint calls) size_t i, kmer_size, num_samples; // // Reference genome // // Hash map of chromosome name -> sequence ChromHash *genome; ReadBuffer chroms; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len < 0,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" "SUBCMD" -h` for help. 
Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(max_align_len < 0) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); in_path = argv[optind++]; ref_paths = (char const*const*)argv + optind; nref_paths = argc - optind; // These functions call die() on error gzFile gzin = futil_gzopen(in_path, "r"); // Read call file header cJSON *json = json_hdr_load(gzin, in_path); // Check we can handle the kmer size kmer_size = json_hdr_get_kmer_size(json, in_path); db_graph_check_kmer_size(kmer_size, in_path); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(in_path), isbubble ? 
"bubble" : "breakpoint"); if(isbubble) { // bubble specific if(sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ; } else { // breakpoint specific if(min_mapq >= 0) cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls"); } // Open flank file if it exists htsFile *samfh = NULL; bam_hdr_t *bam_hdr = NULL; bam1_t *mflank = NULL; if(sam_path) { if((samfh = hts_open(sam_path, "r")) == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_hdr = sam_hdr_read(samfh); if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path); mflank = bam_init1(); } // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, in_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path); num_samples = 0; if(!isbubble) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } // // Open output file // if(!out_path) out_path = "-"; int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *vcffh = hts_open(out_path, modes_htslib[mode]); status("[calls2vcf] Reading %s call file with %zu samples", isbubble ? "Bubble" : "Breakpoint", num_graph_samples); status("[calls2vcf] %zu sample output to: %s format: %s", num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]); if(isbubble) status("[calls2vcf] min. 
MAPQ: %i", min_mapq); status("[calls2vcf] max alignment length: %i", max_align_len); status("[calls2vcf] max VCF allele length: %i", max_allele_len); status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i", nwmatch, nwmismatch, nwgapopen, nwgapextend); // Load reference genome read_buf_alloc(&chroms, 1024); genome = chrom_hash_init(); chrom_hash_load(ref_paths, nref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!isbubble) brkpnt_check_refs_match(json, genome, in_path); bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size, ref_paths, nref_paths, chroms.b, chroms.len); if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header"); AlignedCall *call = acall_init(); CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr); scoring_t *scoring = call_decomp_get_scoring(aligner); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, false, false, 0, 0, 0, 0); CallFileEntry centry; call_file_entry_alloc(¢ry); char kmer_str[50]; sprintf(kmer_str, ";K%zu", kmer_size); if(isbubble) { // Bubble calls DecompBubble *bubbles = decomp_bubble_init(); // Set scoring for aligning 3' flank scoring = decomp_bubble_get_scoring(bubbles); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, true, true, 0, 0, 0, 0); while(call_file_read(gzin, in_path, ¢ry)) { do { if(sam_read1(samfh, bam_hdr, mflank) < 0) die("We've run out of SAM entries!"); } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); // Align call strbuf_reset(&call->info); decomp_bubble_call(bubbles, genome, kmer_size, min_mapq, ¢ry, mflank, bam_hdr, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats)); decomp_bubble_cpy_stats(bub_stats, bubbles); print_bubble_stats(bub_stats); ctx_free(bub_stats); 
decomp_bubble_destroy(bubbles); } else { // Breakpoint calls DecompBreakpoint *breakpoints = decomp_brkpt_init(); while(call_file_read(gzin, in_path, ¢ry)) { strbuf_reset(&call->info); decomp_brkpt_call(breakpoints, genome, num_samples, ¢ry, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats)); decomp_brkpt_cpy_stats(brk_stats, breakpoints); print_breakpoint_stats(brk_stats); ctx_free(brk_stats); decomp_brkpt_destroy(breakpoints); } // Print stats DecomposeStats *astats = ctx_calloc(1, sizeof(*astats)); call_decomp_cpy_stats(astats, aligner); print_acall_stats(astats); ctx_free(astats); call_file_entry_dealloc(¢ry); call_decomp_destroy(aligner); acall_destroy(call); // Finished - clean up cJSON_Delete(json); gzclose(gzin); bcf_hdr_destroy(vcfhdr); hts_close(vcffh); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); chrom_hash_destroy(genome); if(sam_path) { hts_close(samfh); bam_hdr_destroy(bam_hdr); bam_destroy1(mflank); } return EXIT_SUCCESS; }
/**
 * Read every entry of the call file, locate it on the reference and pass the
 * mapped ones to align_entry().
 *
 * For bubble input the alleles are trimmed and the coordinates come from
 * sam_fetch_coords(); otherwise they come from brkpnt_fetch_coords(). The
 * file-level counter `num_entries_read` is incremented once per entry.
 *
 * @param gzin  open call file, positioned after its header
 * @param fout  output stream passed on to align_entry()
 */
static void parse_entries(gzFile gzin, FILE *fout)
{
  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  ChromPosBuffer chrposbuf;
  chrompos_buf_alloc(&chrposbuf, 32);

  StrBuf tmpbuf, flank3pbuf;
  strbuf_alloc(&tmpbuf, 1024);
  strbuf_alloc(&flank3pbuf, 1024);

  const char *flank5p, *flank3p;
  size_t flank5p_len, flank3p_len;
  size_t cpy_flnk_5p, cpy_flnk_3p;

  const read_t *chrom = NULL;
  size_t ref_start = 0, ref_end = 0;
  bool mapped = false, fw_strand = false;

  // Genotype array is only allocated for non-bubble input
  const char **genotypes = NULL;

  if(!input_bubble_format)
    genotypes = ctx_calloc(num_samples, sizeof(char*));

  for(; call_file_read(gzin, input_path, &centry); num_entries_read++)
  {
    size_t nlines = call_file_num_lines(&centry);
    ctx_assert2(!(nlines&1) && nlines >= 6, "Too few lines: %zu", nlines);

    flank5p = call_file_get_line(&centry,1);
    flank5p_len = call_file_line_len(&centry,1);

    cpy_flnk_5p = cpy_flnk_3p = 0;

    // Read a corresponding SAM entry
    if(input_bubble_format)
    {
      // Trim down alleles, add to 3p flank
      bubble_trim_alleles(&centry, &flank3pbuf);
      flank3p = flank3pbuf.b;
      flank3p_len = flank3pbuf.end;

      mapped = sam_fetch_coords(&centry, flank5p, flank5p_len,
                                flank3p, flank3p_len,
                                &cpy_flnk_5p, &cpy_flnk_3p,
                                &chrom, &ref_start, &ref_end, &fw_strand);
    }
    else {
      flank3p = call_file_get_line(&centry, 3);
      flank3p_len = call_file_line_len(&centry, 3);

      mapped = brkpnt_fetch_coords(&centry, &chrposbuf,
                                   &chrom, &ref_start, &ref_end, &fw_strand,
                                   &cpy_flnk_5p, &cpy_flnk_3p);
    }

    if(mapped)
    {
      // Get call id
      const char *hdrline = call_file_get_line(&centry, 0);
      char callid[100];
      int r = get_callid_str(hdrline, input_bubble_format, callid, sizeof(callid));
      if(r == -1) die("Poorly formatted: %s", hdrline);
      if(r == -2) die("Call id string is too long: %s", hdrline);

      align_entry(&centry, callid, flank5p, flank5p_len, flank3p, flank3p_len,
                  cpy_flnk_5p, cpy_flnk_3p,
                  chrom, ref_start, ref_end, fw_strand,
                  &tmpbuf, genotypes, fout);
    }
  }

  ctx_free(genotypes);
  call_file_entry_dealloc(&centry);
  chrompos_buf_dealloc(&chrposbuf);
  strbuf_dealloc(&tmpbuf);
  strbuf_dealloc(&flank3pbuf);
}
/**
 * Pick a cleaning threshold from kmer coverage histogram. Assumes low coverage
 * kmers are all due to error. Fits a poisson with a gamma distributed mean,
 * then tries a series of cutoff-picking strategies in turn, using the first
 * one that yields a cutoff.
 *
 * Translated from Gil McVean's initial proposed method in R code
 *
 * @param kmer_covg      Histogram of kmer counts at coverages 1,2,.. arrlen-1
 *                       (kmer_covg[0] must be zero)
 * @param arrlen         Length of array kmer_covg (at least 10)
 * @param alpha_est_ptr  If not NULL, used to return estimate for alpha
 * @param beta_est_ptr   If not NULL, used to return estimate for beta
 * @param false_pos_ptr  If not NULL, receives the false positive value
 *                       reported by cutoff_get_FP_FN() for the chosen cutoff
 * @param false_neg_ptr  If not NULL, receives the false negative value
 *                       reported by cutoff_get_FP_FN() for the chosen cutoff
 * @return -1 if no acceptable cut-off was found, otherwise the coverage cutoff
 */
int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
                                 double *alpha_est_ptr, double *beta_est_ptr,
                                 double *false_pos_ptr, double *false_neg_ptr)
{
  ctx_assert(arrlen >= 10);
  ctx_assert2(kmer_covg[0] == 0, "Shouldn't see any kmers with coverage zero");

  // Ratios between the first histogram bins
  const double r1 = (double)kmer_covg[2] / kmer_covg[1];
  const double r2 = (double)kmer_covg[3] / kmer_covg[2];
  const double rr = r2 / r1;

  // Grid search over aa = { 0.01, 0.02, ..., 1.99, 2.00 } for the value
  // that minimises abs(faa-rr)
  size_t step, best_step = 0;
  double best_diff = DBL_MAX;

  for(step = 1; step <= 200; step++)
  {
    const double aa = step*0.01;
    const double faa = tgamma(aa)*tgamma(aa+2) / (2*pow(tgamma(aa+1),2));
    const double diff = fabs(faa-rr);
    if(diff < best_diff) {
      best_diff = diff;
      best_step = step;
    }
  }

  // a_est, b_est are estimates for alpha, beta of gamma distribution
  const double a_est = best_step*0.01;
  double b_est = tgamma(a_est + 1.0) / (r1 * tgamma(a_est)) - 1.0;
  b_est = MAX2(b_est, 1); // Avoid beta values <1
  const double c0 = kmer_covg[1] * pow(b_est/(1+b_est),-a_est);

  if(alpha_est_ptr) *alpha_est_ptr = a_est;
  if(beta_est_ptr)  *beta_est_ptr  = b_est;

  // keep coverage estimates on the stack - this should be ok
  double e_covg[arrlen];
  double e_total = 0;
  uint64_t d_total = 0;

  // Loop invariants, computed once
  const double log_b_est = log(b_est);
  const double log_one_plus_b_est = log(1 + b_est);
  const double lgamma_a_est = lgamma(a_est);

  // Expected count at each coverage level, plus running totals of expected
  // and observed counts. note: lfactorial(x) = lgamma(x+1)
  size_t covg;
  for(covg = 1; covg < arrlen; covg++)
  {
    const double log_e = a_est * log_b_est - lgamma_a_est - lgamma(covg) +
                         lgamma(a_est + covg - 1) -
                         (a_est + covg - 1) * log_one_plus_b_est;
    e_covg[covg] = exp(log_e) * c0;
    e_total += e_covg[covg];
    d_total += kmer_covg[covg];
  }

  // Find cutoff by finding first coverage level where errors make up less than
  // 0.1% of total coverage
  int cutoff = pick_cutoff_with_fdr_thresh(e_covg, kmer_covg, arrlen, 0.001);

  // Pick highest cutoff that keeps FP < FN
  if(cutoff < 0)
    cutoff = pick_cutoff_FP_lt_FN(e_covg, e_total, kmer_covg, d_total, arrlen);

  if(cutoff < 0)
    cutoff = pick_cutoff_loss_vs_error(e_covg, e_total, kmer_covg, arrlen);

  // Give up if nothing was found, or if the cutoff fails the 20% coverage
  // check (WGS should be much higher, Exome sequencing needs low cutoff)
  if(cutoff < 0 || !is_cutoff_good(kmer_covg, arrlen, cutoff, 0.2))
    return -1;

  // Calculate FP,FN rates only when the caller asked for them
  if(false_pos_ptr != NULL || false_neg_ptr != NULL)
  {
    double fp = 0, fn = 0;
    cutoff_get_FP_FN(e_covg, e_total, kmer_covg, d_total, cutoff, &fp, &fn);
    if(false_pos_ptr) *false_pos_ptr = fp;
    if(false_neg_ptr) *false_neg_ptr = fn;
  }

  return cutoff;
}
/**
 * Pick a cleaning threshold from kmer coverage histogram. Assumes low coverage
 * kmers are all due to error, to which it fits a gamma distribution. Then
 * chooses a cleaning threshold such that FDR (uncleaned kmers) occur at a rate
 * of < the FDR parameter.
 *
 * Translated from Gil McVean's proposed method in R code
 *
 * @param kmer_covg      Histogram of kmer counts at coverages 1,2,.. arrlen-1
 *                       (kmer_covg[0] must be zero)
 * @param arrlen         Length of array kmer_covg (at least 10)
 * @param fdr_limit      False discovery rate for a single kmer coverage
 *                       (1/1000 i.e. 0.001 is reasonable), 0 < fdr_limit < 1
 * @param alpha_est_ptr  If not NULL, used to return estimate for alpha
 * @param beta_est_ptr   If not NULL, used to return estimate for beta
 * @return -1 if no cut-off satisfies FDR or the cut-off would keep less than
 *         20% of coverage, otherwise returns coverage cutoff
 */
int cleaning_pick_kmer_threshold(const uint64_t *kmer_covg, size_t arrlen,
                                 double fdr_limit,
                                 double *alpha_est_ptr, double *beta_est_ptr)
{
  ctx_assert(arrlen >= 10);
  ctx_assert2(0 < fdr_limit && fdr_limit < 1, "expected 0 < FDR < 1: %f", fdr_limit);
  ctx_assert2(kmer_covg[0] == 0, "Shouldn't see any kmers with coverage zero");

  size_t i, min_a_est_idx = 0;
  double r1, r2, rr, min_a_est = DBL_MAX, tmp;
  double aa, faa, a_est, b_est, c0;

  r1 = (double)kmer_covg[2] / kmer_covg[1];
  r2 = (double)kmer_covg[3] / kmer_covg[2];
  rr = r2 / r1;

  // iterate aa = { 0.01, 0.02, ..., 1.99, 2.00 }
  // find aa value that minimises abs(faa-rr)
  for(i = 1; i <= 200; i++)
  {
    aa = i*0.01;
    faa = tgamma(aa)*tgamma(aa+2) / (2*pow(tgamma(aa+1),2));
    tmp = fabs(faa-rr);
    if(tmp < min_a_est) {
      min_a_est = tmp;
      min_a_est_idx = i;
    }
  }

  // a_est, b_est are estimates for alpha, beta of gamma distribution
  a_est = min_a_est_idx*0.01;
  b_est = tgamma(a_est + 1.0) / (r1 * tgamma(a_est)) - 1.0;
  b_est = MAX2(b_est, 0.000001); // Avoid negative beta
  c0 = kmer_covg[1] * pow(b_est/(1+b_est),-a_est);

  if(alpha_est_ptr) *alpha_est_ptr = a_est;
  if(beta_est_ptr) *beta_est_ptr = b_est;

  // Initialise fdr to be greater than fdr_limit
  double e_cov, e_cov_c0, fdr = 2.0, log_b_est, log_one_plus_b_est, lgamma_a_est;

  // Calculate some values here for speed
  log_b_est          = log(b_est);
  log_one_plus_b_est = log(1 + b_est);
  lgamma_a_est       = lgamma(a_est);

  // Scan coverage levels for the first one whose FDR is below the limit.
  // Start at coverage 1: the histogram has no kmers at coverage 0, and
  // evaluating the model there means lgamma(0) and a 0/0 division.
  // note: lfactorial(x) = lgamma(x+1)
  for(i = 1; i < arrlen; i++)
  {
    e_cov = a_est * log_b_est - lgamma_a_est - lgamma(i) +
            lgamma(a_est + i - 1) - (a_est + i - 1) * log_one_plus_b_est;
    e_cov_c0 = exp(e_cov) * c0;
    fdr = 1.0 - (kmer_covg[i] - e_cov_c0) / kmer_covg[i];
    if(fdr < fdr_limit) break;
  }

  // No coverage level satisfied the FDR limit
  if(!(fdr < fdr_limit)) return -1;

  size_t cutoff = i;

  // Check the cutoff keeps enough coverage: sum coverage either side of it
  uint64_t kmers_below = 0, kmers_above = 0;
  for(i = 0;      i < cutoff; i++) kmers_below += kmer_covg[i]*i;
  for(i = cutoff; i < arrlen; i++) kmers_above += kmer_covg[i]*i;

  // At least 20% of kmers should be kept
  bool good_cutoff = ((double)kmers_above/(kmers_below+kmers_above) >= 0.2);

  return good_cutoff ? (int)cutoff : -1;
}
void assemble_contigs_stats_print(const AssembleContigStats *s) { ctx_assert(s->lengths.len == s->junctns.len); ctx_assert(s->lengths.len == s->num_contigs); size_t i, ncontigs = s->num_contigs; if(ncontigs == 0) { status("[asm] No contigs assembled"); return; } qsort(s->lengths.b, ncontigs, sizeof(s->lengths.b[0]), cmp_size); qsort(s->junctns.b, ncontigs, sizeof(s->junctns.b[0]), cmp_size); size_t len_n50, jnc_n50; size_t len_median, jnc_median, len_mean, jnc_mean; size_t len_min, len_max, jnc_min, jnc_max; // Calculate N50s len_n50 = calc_N50(s->lengths.b, ncontigs, s->total_len); jnc_n50 = calc_N50(s->junctns.b, ncontigs, s->total_junc); // Calculate medians, means len_median = MEDIAN(s->lengths.b, ncontigs); jnc_median = MEDIAN(s->junctns.b, ncontigs); len_mean = (double)s->total_len / ncontigs; jnc_mean = (double)s->total_junc / ncontigs; // Calculate min, max len_min = s->lengths.b[0]; jnc_min = s->junctns.b[0]; len_max = s->lengths.b[ncontigs-1]; jnc_max = s->junctns.b[ncontigs-1]; // Print number of contigs char num_contigs_str[50], reseed_str[50], seed_not_fnd_str[50]; char seed_kmers_str[50], seed_paths_str[50]; long_to_str(ncontigs, num_contigs_str); long_to_str(s->num_reseed_abort, reseed_str); long_to_str(s->num_seeds_not_found, seed_not_fnd_str); long_to_str(s->num_contigs_from_seed_kmers, seed_kmers_str); long_to_str(s->num_contigs_from_seed_paths, seed_paths_str); status(PREFIX"pulled out %s contigs, %s from seed kmers, %s from seed paths", num_contigs_str, seed_kmers_str, seed_paths_str); status(PREFIX"no-reseed aborted %s times", reseed_str); status(PREFIX"seed kmer not found %s times", seed_not_fnd_str); char len_min_str[50], len_max_str[50], len_total_str[50]; char len_mean_str[50], len_median_str[50], len_n50_str[50]; char jnc_min_str[50], jnc_max_str[50], jnc_total_str[50]; char jnc_mean_str[50], jnc_median_str[50], jnc_n50_str[50]; // Use ulong_to_str instead of num_to_str to get better accuracy // e.g. 
966 instead of 1K ulong_to_str(len_mean, len_mean_str); ulong_to_str(jnc_mean, jnc_mean_str); ulong_to_str(len_median, len_median_str); ulong_to_str(jnc_median, jnc_median_str); ulong_to_str(len_n50, len_n50_str); ulong_to_str(jnc_n50, jnc_n50_str); ulong_to_str(len_min, len_min_str); ulong_to_str(jnc_min, jnc_min_str); ulong_to_str(len_max, len_max_str); ulong_to_str(jnc_max, jnc_max_str); ulong_to_str(s->total_len, len_total_str); ulong_to_str(s->total_junc, jnc_total_str); status(PREFIX"Lengths: mean: %s median: %s N50: %s min: %s max: %s total: %s [kmers]", len_mean_str, len_median_str, len_n50_str, len_min_str, len_max_str, len_total_str); status(PREFIX"Junctions: mean: %s median: %s N50: %s min: %s max: %s total: %s [out >1]", jnc_mean_str, jnc_median_str, jnc_n50_str, jnc_min_str, jnc_max_str, jnc_total_str); status(PREFIX"Max junction density: %.2f\n", s->max_junc_density); timestamp(); message(PREFIX" Outdegree: "); char nout_str[50]; for(i = 0; i <= 4; i++) { message("\t%zu:%s [%zu%%]", i, ulong_to_str(s->contigs_outdegree[i], nout_str), (size_t)((100.0*s->contigs_outdegree[i])/(2.0*ncontigs)+0.5)); } message("\n"); _print_path_dist(s->paths_held, AC_MAX_PATHS, "Paths held", ncontigs); _print_path_dist(s->paths_cntr, AC_MAX_PATHS, "Paths counter", ncontigs); const uint64_t *states = s->grphwlk_steps; size_t nsteps = s->total_len - s->num_contigs, ncontigends = 2*s->num_contigs; status(PREFIX"Traversal succeeded because:"); _print_grphwlk_state("Pop straight ......... ", states[GRPHWLK_POPFWD], nsteps); _print_grphwlk_state("Col straight ......... ", states[GRPHWLK_COLFWD], nsteps); _print_grphwlk_state("PopFork use colour ... ", states[GRPHWLK_POPFRK_COLFWD],nsteps); _print_grphwlk_state("Go paths ............. ", states[GRPHWLK_USEPATH], nsteps); const uint64_t *stops = s->stop_causes; status(PREFIX"Traversal halted because:"); _print_grphwlk_state("No coverage .......... 
", stops[ASSEM_STOP_NOCOVG], ncontigends); _print_grphwlk_state("No colour covg ....... ", stops[ASSEM_STOP_NOCOLCOVG], ncontigends); _print_grphwlk_state("No paths ............. ", stops[ASSEM_STOP_NOPATHS], ncontigends); _print_grphwlk_state("Paths split .......... ", stops[ASSEM_STOP_SPLIT_PATHS], ncontigends); _print_grphwlk_state("Missing paths ........ ", stops[ASSEM_STOP_MISSING_PATHS], ncontigends); _print_grphwlk_state("Graph cycles ......... ", stops[ASSEM_STOP_CYCLE], ncontigends); _print_grphwlk_state("Low step confidence .. ", stops[ASSEM_STOP_LOW_STEP_CONF], ncontigends); _print_grphwlk_state("Low cumul. confidence ", stops[ASSEM_STOP_LOW_CUMUL_CONF],ncontigends); size_t njunc = states[GRPHWLK_USEPATH] + stops[ASSEM_STOP_NOPATHS] + stops[ASSEM_STOP_SPLIT_PATHS] + stops[ASSEM_STOP_MISSING_PATHS]; ctx_assert2(s->total_junc == states[GRPHWLK_USEPATH], "%zu vs %zu", (size_t)s->total_junc, (size_t)states[GRPHWLK_USEPATH]); status(PREFIX"Junctions:"); _print_grphwlk_state("Paths resolved", states[GRPHWLK_USEPATH], njunc); }
int ctx_links(int argc, char **argv) { size_t limit = 0; const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL; const char *thresh_path = NULL, *hist_path = NULL; size_t hist_distsize = 0, hist_covgsize = 0; size_t cutoff = 0; bool clean = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break; case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break; case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break; case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break; case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break; case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break; case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break; case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" links -h` for help. 
Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist"); if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist"); // Defaults if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST; if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG; if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments"); const char *ctp_path = argv[optind]; bool list = (csv_out_path != NULL); bool plot = (plot_out_path != NULL); bool save = (link_out_path != NULL); bool hist_covg = (thresh_path != NULL || hist_path != NULL); size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1); if(clean && !save) cmd_print_usage("Need to give --out <out.ctp.gz> with --clean"); if(!save && !list && !plot && !hist_covg) cmd_print_usage("Please specify one of --plot, --list or --clean"); if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0) cmd_print_usage("Outputing both cleaning threshold (-T) and links (-o) to STDOUT!"); // Open input file FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL; FILE *thresh_fh = NULL, *hist_fh = NULL; gzFile link_gz = NULL; // Check file don't exist or that we can overwrite // Will ignore if path is null bool err = false; err |= futil_check_outfile(csv_out_path); err |= futil_check_outfile(plot_out_path); err |= futil_check_outfile(link_out_path); err |= futil_check_outfile(thresh_path); err |= futil_check_outfile(hist_path); if(err) die("Use -f,--force to overwrite files"); StrBuf link_tmp_path; strbuf_alloc(&link_tmp_path, 1024); GPathReader ctpin; memset(&ctpin, 0, sizeof(ctpin)); gpath_reader_open(&ctpin, ctp_path); size_t ncols = file_filter_into_ncols(&ctpin.fltr); size_t kmer_size = gpath_reader_get_kmer_size(&ctpin); cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1); if(ncols != 1) die("Can only clean a single colour at a time. 
Sorry."); uint64_t (*hists)[hist_covgsize] = NULL; if(hist_covg) { hists = ctx_calloc(hist_distsize, sizeof(hists[0])); } if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL) die("Cannot open file: %s", hist_path); if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL) die("Cannot open file: %s", thresh_path); if(limit) status("Limiting to the first %zu kmers", limit); if(clean) { timestamp(); message(" Cleaning coverage below %zu", cutoff); message("\n"); } if(save) { // Check we can find the fields we need cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); if(!nkmers_json || !nlinks_json || !nbytes_json) die("Cannot find required header entries"); // Create a random temporary file link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path); status("Saving output to: %s", link_out_path); status("Temporary output: %s", link_tmp_path.b); // Open output file if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL) die("Cannot open output link file: %s", link_out_path); // Need to open output file first so we can get absolute path // Update the header to include this command json_hdr_add_curr_cmd(newhdr, link_out_path); } if(list) { status("Listing to %s", csv_out_path); if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL) die("Cannot open output CSV file %s", csv_out_path); // Print csv header fprintf(list_fh, "SeqLen,Covg\n"); } if(plot) { status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path); if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL) die("Cannot open output .dot file %s", plot_out_path); } SizeBuffer countbuf, jposbuf; size_buf_alloc(&countbuf, 16); size_buf_alloc(&jposbuf, 
1024); StrBuf kmerbuf, juncsbuf, seqbuf, outbuf; strbuf_alloc(&kmerbuf, 1024); strbuf_alloc(&juncsbuf, 1024); strbuf_alloc(&seqbuf, 1024); strbuf_alloc(&outbuf, 1024); bool link_fw; size_t njuncs; size_t knum, nlinks, num_links_exp = 0; LinkTree ltree; ltree_alloc(<ree, kmer_size); LinkTreeStats tree_stats; memset(&tree_stats, 0, sizeof(tree_stats)); size_t init_num_links = 0, num_links = 0; for(knum = 0; !limit || knum < limit; knum++) { ltree_reset(<ree); if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break; ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu", kmerbuf.end, kmer_size); // status("kmer: %s", kmerbuf.b); for(nlinks = 0; gpath_reader_read_link(&ctpin, &link_fw, &njuncs, &countbuf, &juncsbuf, &seqbuf, &jposbuf); nlinks++) { ltree_add(<ree, link_fw, countbuf.b[0], jposbuf.b, juncsbuf.b, seqbuf.b); } if(nlinks != num_links_exp) warn("Links count mismatch %zu != %zu", nlinks, num_links_exp); if(hist_covg) { ltree_update_covg_hists(<ree, (uint64_t*)hists, hist_distsize, hist_covgsize); } if(clean) { ltree_clean(<ree, cutoff); } // Accumulate statistics ltree_get_stats(<ree, &tree_stats); num_links = tree_stats.num_links - init_num_links; init_num_links = tree_stats.num_links; if(list) { ltree_write_list(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end) die("Cannot write CSV file to: %s", csv_out_path); strbuf_reset(&outbuf); } if(save && num_links) { ltree_write_ctp(<ree, kmerbuf.b, num_links, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end) die("Cannot write ctp file to: %s", link_tmp_path.b); strbuf_reset(&outbuf); } if(plot && knum == plot_kmer_idx) { status("Plotting tree..."); ltree_write_dot(<ree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end) die("Cannot write plot DOT file to: %s", plot_out_path); strbuf_reset(&outbuf); } } gpath_reader_close(&ctpin); cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON 
*nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links); status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links); status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes); if(save) { // Update JSON nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links; nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links; nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes; char *json_str = cJSON_Print(newhdr); if(gzputs(link_gz, json_str) != (int)strlen(json_str)) die("Cannot write ctp file to: %s", link_out_path); free(json_str); gzputs(link_gz, "\n\n"); gzputs(link_gz, ctp_explanation_comment); gzputs(link_gz, "\n"); fseek(link_tmp_fh, 0, SEEK_SET); char *tmp = ctx_malloc(4*ONE_MEGABYTE); size_t s; while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) { if(gzwrite(link_gz, tmp, s) != (int)s) die("Cannot write to output: %s", link_out_path); } ctx_free(tmp); gzclose(link_gz); fclose(link_tmp_fh); } // Write histogram to file if(hist_fh) { size_t i, j; fprintf(hist_fh, " "); for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j); fprintf(hist_fh, "\n"); for(i = 1; i < hist_distsize; i++) { fprintf(hist_fh, "dist.%02zu", i); for(j = 1; j < hist_covgsize; j++) { fprintf(hist_fh, ",%"PRIu64, hists[i][j]); } fprintf(hist_fh, "\n"); } } if(thresh_fh) { // Use median of first five cutoffs print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh); } if(hist_fh && hist_fh != stdout) fclose(hist_fh); if(list) { fclose(list_fh); } if(plot) { fclose(plot_fh); } ctx_free(hists); cJSON_Delete(newhdr); strbuf_dealloc(&link_tmp_path); 
ltree_dealloc(<ree); size_buf_dealloc(&countbuf); size_buf_dealloc(&jposbuf); strbuf_dealloc(&kmerbuf); strbuf_dealloc(&juncsbuf); strbuf_dealloc(&seqbuf); strbuf_dealloc(&outbuf); return EXIT_SUCCESS; }
/**
 * Run one A->B->C traversal experiment seeded at `node` (treated as B).
 *
 * Steps, as performed below:
 *   1. Walk away from B for fewer than wrkr->max_AB_dist nodes to pick A.
 *   2. Reverse-complement the walked nodes so the buffer reads A..B, then
 *      either prime the walker along A->B and keep extending
 *      (wrkr->prime_AB), or try to re-traverse A->B and extend past it with
 *      confirm_seq().
 *   3. The last node in the buffer is now C: start at B and check with
 *      confirm_seq() whether C is reached.
 *
 * Failure counters in wrkr->ab_fail_state / wrkr->bc_fail_state are bumped
 * when a traversal stops short, and the contig is printed if
 * wrkr->print_failed_contigs is set.
 *
 * @param node  seed node B
 * @param wrkr  per-worker state (graph, walkers, node buffer, settings)
 * @return one of the RES_* result codes
 */
static inline int test_statement_node(dBNode node, ExpABCWorker *wrkr)
{
  const dBGraph *db_graph = wrkr->db_graph;
  dBNodeBuffer *nbuf = &wrkr->nbuf;
  GraphWalker *wlk = &wrkr->gwlk;
  RepeatWalker *rpt = &wrkr->rptwlk;
  size_t col = wrkr->colour;
  size_t b_pos;

  // rpt_walker_clear(rpt);

  // Node buffer starts out holding only B
  db_node_buf_reset(nbuf);
  db_node_buf_add(nbuf, node);

  // size_t AB_limit = wrkr->prime_AB ? SIZE_MAX : wrkr->max_AB_dist;
  size_t walk_limit = wrkr->max_AB_dist;
  // status("walk_limit: %zu", walk_limit);

  // Walk from B to find A
  graph_walker_setup(wlk, true, col, col, db_graph);
  graph_walker_start(wlk, nbuf->b[0]);

  for(;;)
  {
    // The walker is advanced before the length limit is tested
    if(!graph_walker_next(wlk)) break;
    if(nbuf->len >= walk_limit) break;

    if(!rpt_walker_attempt_traverse(rpt, wlk)) {
      reset(wlk,rpt,nbuf);
      return RES_LOST_IN_RPT;
    }
    db_node_buf_add(nbuf, wlk->node);
  }

  reset(wlk,rpt,nbuf);

  // Only B in the buffer: could not move at all
  if(nbuf->len == 1) return RES_NO_TRAVERSAL;

  // Traverse A->B
  db_nodes_reverse_complement(nbuf->b, nbuf->len);
  b_pos = nbuf->len - 1;

  if(wrkr->prime_AB)
  {
    // Prime A->B without attempting to cross
    graph_walker_prime(wlk, nbuf->b, nbuf->len, nbuf->len, true);

    for(;;)
    {
      if(!graph_walker_next(wlk)) break;

      if(!rpt_walker_attempt_traverse(rpt, wlk)) {
        reset(wlk,rpt,nbuf);
        return RES_LOST_IN_RPT;
      }
      db_node_buf_add(nbuf, wlk->node);
    }
  }
  else
  {
    // Attempt to traverse A->B then extend past B
    int ab_result = confirm_seq(0, true, wlk, rpt, nbuf, col, db_graph);

    if(ab_result == CONFIRM_REPEAT) return RES_LOST_IN_RPT;

    if(ab_result == CONFIRM_OVERSHOT) {
      ctx_assert2(0,"Can't 'overshoot' when extending");
    }

    // If the assertion above does not stop execution, an overshoot is
    // reported the same way as a wrong traversal
    if(ab_result == CONFIRM_OVERSHOT || ab_result == CONFIRM_WRONG)
      return RES_AB_WRONG;

    if(ab_result == CONFIRM_SHORT) {
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, true, wrkr->prime_AB);
      wrkr->ab_fail_state[wlk->last_step.status]++;
      return RES_AB_FAILED;
    }
  }

  reset(wlk,rpt,nbuf);

  // Couldn't get past B
  if(nbuf->len == b_pos+1) return RES_NO_TRAVERSAL;

  // Last node is now C
  // Walk from B... record whether or not we reach C
  ctx_assert(db_nodes_are_equal(nbuf->b[b_pos], db_node_reverse(node)));

  int bc_result = confirm_seq(b_pos, false, wlk, rpt, nbuf, col, db_graph);

  if(bc_result == CONFIRM_REPEAT)   return RES_LOST_IN_RPT;
  if(bc_result == CONFIRM_OVERSHOT) return RES_BC_OVERSHOT;
  if(bc_result == CONFIRM_WRONG)    return RES_BC_WRONG;

  if(bc_result == CONFIRM_SHORT) {
    if(wrkr->print_failed_contigs)
      print_failed(node, nbuf, db_graph, false, wrkr->prime_AB);
    wrkr->bc_fail_state[wlk->last_step.status]++;
    return RES_BC_FAILED;
  }

  if(bc_result == CONFIRM_SUCCESS) return RES_ABC_SUCCESS;

  die("Shouldn't reach here: r=%i", bc_result);
  return -1;
}