static void gpath_save_thread(void *arg) { GPathSaver *wrkr = (GPathSaver*)arg; const dBGraph *db_graph = wrkr->db_graph; GPathSubset subset; StrBuf sbuf; gpath_subset_alloc(&subset); gpath_subset_init(&subset, &wrkr->db_graph->gpstore.gpset); strbuf_alloc(&sbuf, 2 * DEFAULT_IO_BUFSIZE); dBNodeBuffer nbuf; SizeBuffer jposbuf; db_node_buf_alloc(&nbuf, 1024); size_buf_alloc(&jposbuf, 256); HASH_ITERATE_PART(&db_graph->ht, wrkr->threadid, wrkr->nthreads, _gpath_gzsave_node, &sbuf, &subset, wrkr->save_seq ? &nbuf : NULL, wrkr->save_seq ? &jposbuf : NULL, wrkr->gzout, wrkr->outlock, db_graph); _gpath_save_flush(wrkr->gzout, &sbuf, wrkr->outlock); db_node_buf_dealloc(&nbuf); size_buf_dealloc(&jposbuf); gpath_subset_dealloc(&subset); strbuf_dealloc(&sbuf); }
char * strbuf_double(idnconv_strbuf_t *buf) { /* * Double the size of the buffer of BUF. */ return (strbuf_alloc(buf, buf->size * 2)); }
// Returns number of types. int load_hla_csv(const char *path, char ***bools_ptr, int num_rows) { assert(num_rows > 0); StrBuf line; strbuf_alloc(&line, 1024); FILE *fh = fopen(path, "r"); if(fh == NULL) die("Cannot open file: %s.", path); if(strbuf_readline(&line, fh) == 0) die("Empty CSV file: %s.", path); int num_types = count_char(line.b, ','); char **bools = my_malloc(sizeof(char*) * num_rows, __FILE__, __LINE__); char *data = my_malloc(sizeof(char) * num_rows * (num_types+1), __FILE__, __LINE__); printf("Number of rows: %i.\n",num_rows); int i; for(i = 0; i < num_rows && strbuf_reset_readline(&line, fh); i++) { strbuf_chomp(&line); bools[i] = data + i * (num_types+1); load_comma_bool_line(line.b, bools[i], num_types); bools[i][num_types] = '\0'; } if(i < num_rows) die("Not enough rows in CSV file: %s.", path); fclose(fh); strbuf_dealloc(&line); *bools_ptr = bools; return num_types; }
char * strbuf_copy(idnconv_strbuf_t *buf, const char *str) { /* * Copy STR to BUF. */ size_t len = strlen(str); if (strbuf_alloc(buf, len + 1) == NULL) return (NULL); strcpy(buf->str, str); return (buf->str); }
CallDecomp* call_decomp_init(htsFile *vcffh, bcf_hdr_t *vcfhdr) { CallDecomp *dc = ctx_calloc(1, sizeof(CallDecomp)); dc->nw_aligner = needleman_wunsch_new(); dc->aln = alignment_create(1024); dc->scoring = ctx_calloc(1, sizeof(dc->scoring[0])); scoring_system_default(dc->scoring); dc->vcffh = vcffh; dc->vcfhdr = vcfhdr; dc->v = bcf_init(); strbuf_alloc(&dc->sbuf, 256); return dc; }
char * strbuf_append(idnconv_strbuf_t *buf, const char *str) { /* * Append STR to the end of BUF. */ size_t len1 = strlen(buf->str); size_t len2 = strlen(str); char *p; #define MARGIN 50 p = strbuf_alloc(buf, len1 + len2 + 1 + MARGIN); if (p != NULL) strcpy(buf->str + len1, str); return (p); }
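/*
 * The three idnconv helpers above (strbuf_double, strbuf_copy, strbuf_append)
 * all grow the underlying buffer through strbuf_alloc() and signal failure by
 * returning NULL. A minimal usage sketch; the strbuf_init()/strbuf_reset()
 * pair used for setup and cleanup here is assumed (hypothetical names, not
 * shown in the snippets above):
 */
static int build_label(idnconv_strbuf_t *buf, const char *prefix, const char *name) {
	strbuf_init(buf);                         /* assumed initialiser */
	if (strbuf_copy(buf, prefix) == NULL)     /* allocates strlen(prefix)+1 bytes */
		return -1;
	if (strbuf_append(buf, name) == NULL)     /* grows by combined length + MARGIN */
		return -1;
	/* buf->str now holds the concatenation; release with strbuf_reset(buf). */
	return 0;
}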
void vcf_misc_hdr_add_cmd(bcf_hdr_t *hdr, const char *cmdline, const char *cwd) { char keystr[8], timestr[100]; time_t tnow; time(&tnow); strftime(timestr, sizeof(timestr), "%Y%m%d-%H:%M:%S", localtime(&tnow)); StrBuf sbuf; strbuf_alloc(&sbuf, 1024); strbuf_sprintf(&sbuf, "##mccortex_%s=<prev=\"NULL\",cmd=\"%s\",cwd=\"%s\"," "datetime=\"%s\",version="CTX_VERSION">\n", hex_rand_str(keystr, sizeof(keystr)), cmdline, cwd, timestr); bcf_hdr_append(hdr, sbuf.b); strbuf_dealloc(&sbuf); }
static cJSON* read_input_header(gzFile gzin) { cJSON *json; StrBuf hdrstr; strbuf_alloc(&hdrstr, 1024); json_hdr_read(NULL, gzin, input_path, &hdrstr); json = cJSON_Parse(hdrstr.b); if(json == NULL) die("Invalid JSON header: %s", input_path); // Check we can handle the kmer size kmer_size = json_hdr_get_kmer_size(json, input_path); db_graph_check_kmer_size(kmer_size, input_path); strbuf_dealloc(&hdrstr); return json; }
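// The McCortex functions in this file share one StrBuf lifecycle:
// strbuf_alloc() once with an initial capacity, build content with
// strbuf_append_str()/strbuf_append_char()/strbuf_sprintf(), read the result
// through the .b (data) and .end (length) fields, then strbuf_dealloc().
// A minimal sketch of that pattern (function name and message are
// illustrative; the string_buffer header path is assumed):
#include <stdio.h>
// #include "string_buffer.h"  /* assumed header providing StrBuf */
static void print_tagged(const char *tag, const char *msg)
{
  StrBuf sbuf;
  strbuf_alloc(&sbuf, 64);              // initial capacity; grows on demand
  strbuf_append_char(&sbuf, '[');
  strbuf_append_str(&sbuf, tag);
  strbuf_append_str(&sbuf, "] ");
  strbuf_append_str(&sbuf, msg);
  strbuf_append_char(&sbuf, '\n');
  fwrite(sbuf.b, 1, sbuf.end, stdout);  // .b is the buffer, .end its length
  strbuf_dealloc(&sbuf);
}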
void test_graph_crawler() { test_status("Testing graph crawler..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; db_graph_alloc(&graph, kmer_size, ncols, 1, 2048, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); char graphseq[3][77] = // < X X X............... {"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC", "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA", "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"}; build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0])); build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1])); build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2])); // Crawl graph GraphCrawler crawler; graph_crawler_alloc(&crawler, &graph); dBNode node = db_graph_find_str(&graph, graphseq[0]); dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1); TASSERT(node.key != HASH_NOT_FOUND); TASSERT(next_node.key != HASH_NOT_FOUND); BinaryKmer bkey = db_node_get_bkmer(&graph, node.key); Edges edges = db_node_get_edges(&graph, node.key, 0); dBNode next_nodes[4]; Nucleotide next_nucs[4]; size_t i, p, num_next, next_idx; num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges, next_nodes, next_nucs); next_idx = 0; while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node)) next_idx++; TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node)); // Crawl in all colours graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next, NULL, graph.num_of_cols, NULL, NULL, NULL); TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths); // Fetch paths dBNodeBuffer nbuf; db_node_buf_alloc(&nbuf, 16); StrBuf sbuf; strbuf_alloc(&sbuf, 128); for(p = 0; p < crawler.num_paths; p++) { db_node_buf_reset(&nbuf); graph_crawler_get_path_nodes(&crawler, p, &nbuf); strbuf_ensure_capacity(&sbuf, nbuf.len+graph.kmer_size); sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b); for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {} TASSERT2(i < 3, "seq: %s", sbuf.b); TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end); TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len); } strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); graph_crawler_dealloc(&crawler); db_graph_dealloc(&graph); }
void graph_info_alloc(GraphInfo *ginfo) { strbuf_alloc(&ginfo->sample_name, 256); error_cleaning_alloc(&ginfo->cleaning); graph_info_init(ginfo); }
static void error_cleaning_alloc(ErrorCleaning *ec) { strbuf_alloc(&ec->intersection_name, 256); error_cleaning_init(ec); }
static void parse_entries(gzFile gzin, FILE *fout) { CallFileEntry centry; call_file_entry_alloc(&centry); ChromPosBuffer chrposbuf; chrompos_buf_alloc(&chrposbuf, 32); StrBuf tmpbuf, flank3pbuf; strbuf_alloc(&tmpbuf, 1024); strbuf_alloc(&flank3pbuf, 1024); const char *flank5p, *flank3p; size_t flank5p_len, flank3p_len; size_t cpy_flnk_5p, cpy_flnk_3p; const read_t *chrom = NULL; size_t ref_start = 0, ref_end = 0; bool mapped = false, fw_strand = false; const char **genotypes = NULL; if(!input_bubble_format) genotypes = ctx_calloc(num_samples, sizeof(char*)); for(; call_file_read(gzin, input_path, &centry); num_entries_read++) { size_t nlines = call_file_num_lines(&centry); ctx_assert2(!(nlines&1) && nlines >= 6, "Too few lines: %zu", nlines); flank5p = call_file_get_line(&centry,1); flank5p_len = call_file_line_len(&centry,1); cpy_flnk_5p = cpy_flnk_3p = 0; // Read a corresponding SAM entry if(input_bubble_format) { // Trim down alleles, add to 3p flank bubble_trim_alleles(&centry, &flank3pbuf); flank3p = flank3pbuf.b; flank3p_len = flank3pbuf.end; mapped = sam_fetch_coords(&centry, flank5p, flank5p_len, flank3p, flank3p_len, &cpy_flnk_5p, &cpy_flnk_3p, &chrom, &ref_start, &ref_end, &fw_strand); } else { flank3p = call_file_get_line(&centry, 3); flank3p_len = call_file_line_len(&centry, 3); mapped = brkpnt_fetch_coords(&centry, &chrposbuf, &chrom, &ref_start, &ref_end, &fw_strand, &cpy_flnk_5p, &cpy_flnk_3p); } if(mapped) { // Get call id const char *hdrline = call_file_get_line(&centry, 0); char callid[100]; int r = get_callid_str(hdrline, input_bubble_format, callid, sizeof(callid)); if(r == -1) die("Poorly formatted: %s", hdrline); if(r == -2) die("Call id string is too long: %s", hdrline); align_entry(&centry, callid, flank5p, flank5p_len, flank3p, flank3p_len, cpy_flnk_5p, cpy_flnk_3p, chrom, ref_start, ref_end, fw_strand, &tmpbuf, genotypes, fout); } } ctx_free(genotypes); call_file_entry_dealloc(&centry); chrompos_buf_dealloc(&chrposbuf); strbuf_dealloc(&tmpbuf); strbuf_dealloc(&flank3pbuf); }
int main(int argc, char **argv) { // compiler complains about unused functions without these lines (void)kh_clear_ghash; (void)kh_del_ghash; if(argc < 2) print_usage(usage, NULL); char swap_alleles = 0; int c; while((c = getopt(argc, argv, "s")) >= 0) { switch (c) { case 's': swap_alleles = 1; break; default: die("Unknown option: %c", c); } } if(optind == argc) print_usage(usage, "Not enough arguments"); char *inputpath = argv[optind]; char **refpaths = argv + optind + 1; size_t num_refs = argc - optind - 1; gzFile gzin = gzopen(inputpath, "r"); if(gzin == NULL) die("Cannot read file: %s", inputpath); size_t i, nchroms = 0, capacity = 1024; khash_t(ghash) *genome = kh_init(ghash); read_t *reads = malloc(capacity * sizeof(read_t)), *r; int hret; khiter_t k; for(i = 0; i < num_refs; i++) { fprintf(stderr, "Loading %s\n", refpaths[i]); load_reads(refpaths[i], &reads, &capacity, &nchroms); } if(num_refs == 0) { fprintf(stderr, "Loading from stdin\n"); load_reads("-", &reads, &capacity, &nchroms); } if(nchroms == 0) die("No chromosomes loaded"); for(i = 0; i < nchroms; i++) { r = reads + i; fprintf(stderr, "Loaded: '%s'\n", r->name.b); k = kh_put(ghash, genome, r->name.b, &hret); if(hret == 0) warn("Duplicate read name (taking first): %s", r->name.b); else kh_value(genome, k) = r; } // Now read VCF StrBuf line; strbuf_alloc(&line, 1024); char *fields[9]; char *chr; int pos, reflen, altlen; while(strbuf_reset_gzreadline(&line, gzin) > 0) { if(line.b[0] == '#') fputs(line.b, stdout); else { strbuf_chomp(&line); vcf_columns(line.b, fields); fields[1][-1] = fields[2][-1] = '\0'; chr = line.b; pos = atoi(fields[1])-1; k = kh_get(ghash, genome, chr); r = (k == kh_end(genome) ? NULL : kh_value(genome, k)); fields[1][-1] = fields[2][-1] = '\t'; reflen = fields[4] - fields[3] - 1; altlen = fields[5] - fields[4] - 1; if(k == kh_end(genome)) warn("Cannot find chrom: %s", chr); else if(pos < 0) warn("Bad line: %s\n", line.b); else if((reflen == 1 && altlen == 1) || fields[3][0] == fields[4][0]) { if((unsigned)pos + reflen <= r->seq.end && strncasecmp(r->seq.b+pos,fields[3],reflen) == 0) { fputs(line.b, stdout); fputc('\n', stdout); } else if(swap_alleles && (unsigned)pos + altlen <= r->seq.end && strncasecmp(r->seq.b+pos,fields[4],altlen) == 0) { // swap alleles char tmp[altlen], *ref = fields[3], *alt = fields[4]; memcpy(tmp, alt, altlen); memmove(ref+altlen+1, ref, reflen); memcpy(ref, tmp, altlen); ref[altlen] = '\t'; fputs(line.b, stdout); fputc('\n', stdout); } // else printf("FAIL0\n"); } // else printf("FAIL1\n"); } } kh_destroy(ghash, genome); strbuf_dealloc(&line); gzclose(gzin); for(i = 0; i < nchroms; i++) seq_read_dealloc(reads+i); free(reads); fprintf(stderr, " Done.\n"); return 0; }
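// The genome lookup in main() above uses klib's khash: kh_get() returns
// kh_end() when the key is absent, and kh_value() may only be read for a
// valid iterator. A small self-contained sketch of that lookup-before-use
// pattern (the map name and value type here are illustrative, not taken
// from the code above):
#include "khash.h"
KHASH_MAP_INIT_STR(strmap, int)   // string -> int map
static int lookup_or_default(khash_t(strmap) *h, const char *key, int dflt)
{
  khiter_t k = kh_get(strmap, h, key);
  return (k == kh_end(h)) ? dflt : kh_value(h, k);
}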
BubbleCaller* bubble_callers_new(size_t num_callers, BubbleCallingPrefs prefs, gzFile gzout, const dBGraph *db_graph) { ctx_assert(num_callers > 0); // Max usage is 4 * max_allele_len * cols size_t i; size_t max_path_len = MAX2(prefs.max_flank_len, prefs.max_allele_len); BubbleCaller *callers = ctx_malloc(num_callers * sizeof(BubbleCaller)); pthread_mutex_t *out_lock = ctx_malloc(sizeof(pthread_mutex_t)); if(pthread_mutex_init(out_lock, NULL) != 0) die("mutex init failed"); size_t *num_bubbles_ptr = ctx_calloc(1, sizeof(size_t)); for(i = 0; i < num_callers; i++) { BubbleCaller tmp = {.threadid = i, .nthreads = num_callers, .haploid_seen = ctx_calloc(1+prefs.num_haploid, sizeof(bool)), .num_bubbles_ptr = num_bubbles_ptr, .prefs = prefs, .db_graph = db_graph, .gzout = gzout, .out_lock = out_lock}; memcpy(&callers[i], &tmp, sizeof(BubbleCaller)); // First two buffers don't actually need to grow db_node_buf_alloc(&callers[i].flank5p, prefs.max_flank_len); db_node_buf_alloc(&callers[i].pathbuf, max_path_len); graph_walker_alloc(&callers[i].wlk, db_graph); rpt_walker_alloc(&callers[i].rptwlk, db_graph->ht.capacity, 22); // 4MB graph_cache_alloc(&callers[i].cache, db_graph); cache_stepptr_buf_alloc(&callers[i].spp_forward, 1024); cache_stepptr_buf_alloc(&callers[i].spp_reverse, 1024); strbuf_alloc(&callers[i].output_buf, 2048); } return callers; } void bubble_callers_destroy(BubbleCaller *callers, size_t num_callers) { ctx_assert(num_callers > 0); size_t i; for(i = 0; i < num_callers; i++) { ctx_free(callers[i].haploid_seen); db_node_buf_dealloc(&callers[i].flank5p); db_node_buf_dealloc(&callers[i].pathbuf); rpt_walker_dealloc(&callers[i].rptwlk); graph_walker_dealloc(&callers[i].wlk); graph_cache_dealloc(&callers[i].cache); cache_stepptr_buf_dealloc(&callers[i].spp_forward); cache_stepptr_buf_dealloc(&callers[i].spp_reverse); strbuf_dealloc(&callers[i].output_buf); } pthread_mutex_destroy(callers[0].out_lock); ctx_free(callers[0].out_lock); ctx_free(callers[0].num_bubbles_ptr); ctx_free(callers); }
// Load each sequence into a separate colour static void test_bubbles(dBGraph *graph, const char **seqs, size_t nseqs, const char *flank5p, const char *flank3p, const char **alleles, size_t nalleles) { db_graph_reset(graph); TASSERT(graph->num_of_cols >= nseqs); size_t i; for(i = 0; i < nseqs; i++) build_graph_from_str_mt(graph, i, seqs[i], strlen(seqs[i]), false); graph->num_of_cols_used = MAX2(graph->num_of_cols_used, 1); StrBuf sbuf; dBNodeBuffer nbuf; strbuf_alloc(&sbuf, 128); db_node_buf_alloc(&nbuf, 128); BubbleCallingPrefs prefs = {.max_allele_len = 100, .max_flank_len = 100, .haploid_cols = NULL, .nhaploid_cols = 0, .remove_serial_bubbles = true}; BubbleCaller *caller = bubble_callers_new(1, &prefs, NULL, graph); _call_bubble(caller, flank5p, flank3p, alleles, nalleles, &nbuf, &sbuf); strbuf_dealloc(&sbuf); db_node_buf_dealloc(&nbuf); bubble_callers_destroy(caller, 1); } void test_bubble_caller() { test_status("Testing bubble calling..."); // Construct 1 colour graph with kmer-size=11 dBGraph graph; const size_t kmer_size = 11, ncols = 3; // Create graph db_graph_alloc(&graph, kmer_size, ncols, 1, 2000, DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS); // mutations: x const char *seqs0[] = {"AGGGATAAAACTCTGTACTGGATCTCCCT", "AGGGATAAAACTCTcTACTGGATCTCCCT"}; const char flank5p0[] = "AGGGATAAAACTCT"; const char flank3p0[] = "TACTGGATCTCCCT"; const char *alleles0[] = {"ATAAAACTCTGTACTGGATCT", "ATAAAACTCTcTACTGGATCT"}; test_bubbles(&graph, seqs0, 2, flank5p0, flank3p0, alleles0, 2); // mutations: x y const char *seqs1[] = {"CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA", "CCCGTAGGTAAGtGCGTTAGTGCAAGGCCACATTGGGACACGAGTTGATA", "CCCGTAGGTAAGGGCGTTAGTGCAAGGCCACtTTGGGACACGAGTTGATA"}; // forwards const char flank5p1a[] = "CCCGTAGGTAAG"; const char flank3p1a[] = "GCGTTAGTGCAAGGCCAC"; const char *alleles1a[] = {"CGTAGGTAAGGGCGTTAGTGC", "CGTAGGTAAGtGCGTTAGTGC"}; const char flank5p1b[] = "GCGTTAGTGCAAGGCCAC"; const char flank3p1b[] = "TTGGGACACGAGTTGATA"; const char *alleles1b[] = {"GCAAGGCCACATTGGGACACG", "GCAAGGCCACtTTGGGACACG"}; test_bubbles(&graph, seqs1, 3, flank5p1a, flank3p1a, alleles1a, 2); test_bubbles(&graph, seqs1, 3, flank5p1b, flank3p1b, alleles1b, 2); // reverse // mutations: y x // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCCCTTACCTACGGG // TATCAACTCGTGTCCCAATGTGGCCTTGCACTAACGCaCTTACCTACGGG // TATCAACTCGTGTCCCAAaGTGGCCTTGCACTAACGCCCTTACCTACGGG // const char flank5p1c[] = "GTGGCCTTGCACTAACGC"; const char flank3p1c[] = "CTTACCTACGGG"; const char *alleles1c[] = {"GCACTAACGCCCTTACCTACG", "GCACTAACGCaCTTACCTACG"}; const char flank5p1d[] = "TATCAACTCGTGTCCCAA"; const char flank3p1d[] = "GTGGCCTTGCACTAACGC"; const char *alleles1d[] = {"CGTGTCCCAATGTGGCCTTGC", "CGTGTCCCAAaGTGGCCTTGC"}; test_bubbles(&graph, seqs1, 3, flank5p1c, flank3p1c, alleles1c, 2); test_bubbles(&graph, seqs1, 3, flank5p1d, flank3p1d, alleles1d, 2); db_graph_dealloc(&graph); }
static bcf_hdr_t* make_vcf_hdr(cJSON *json, const char *in_path, bool is_breakpoint, size_t kmer_size, char const*const* ref_paths, size_t nref_paths, read_t *chroms, size_t nchroms) { ctx_assert(json != NULL); StrBuf hdrbuf; strbuf_alloc(&hdrbuf, 1024); char datestr[9]; time_t date = time(NULL); strftime(datestr, 9, "%Y%m%d", localtime(&date)); strbuf_append_str(&hdrbuf, "##fileformat=VCFv4.2\n##fileDate="); strbuf_append_str(&hdrbuf, datestr); strbuf_append_str(&hdrbuf, "\n"); // Print commands used to generate header cJSON *commands = json_hdr_get(json, "commands", cJSON_Array, in_path); cJSON *command = commands->child; // Print this command char keystr[8]; char *prevstr = NULL; size_t i; if(command) { cJSON *key = json_hdr_get(command, "key", cJSON_String, in_path); prevstr = key->valuestring; } // Print command entry for this command strbuf_append_str(&hdrbuf, "##mccortex_"); strbuf_append_str(&hdrbuf, hex_rand_str(keystr, sizeof(keystr))); strbuf_append_str(&hdrbuf, "=<prev=\""); strbuf_append_str(&hdrbuf, prevstr ? prevstr : "NULL"); strbuf_append_str(&hdrbuf, "\",cmd=\""); strbuf_append_str(&hdrbuf, cmd_get_cmdline()); strbuf_append_str(&hdrbuf, "\",cwd=\""); strbuf_append_str(&hdrbuf, cmd_get_cwd()); strbuf_append_str(&hdrbuf, "\",version="CTX_VERSION">\n"); // Print previous commands vcf_hdrtxt_append_commands(command, &hdrbuf, in_path); // Print field definitions if(is_breakpoint) strbuf_append_str(&hdrbuf, "##INFO=<ID=BRKPNT,Number=1,Type=String,Description=\"Breakpoint call\">\n"); else strbuf_append_str(&hdrbuf, "##INFO=<ID=BUBBLE,Number=1,Type=String,Description=\"Bubble call\">\n"); strbuf_sprintf(&hdrbuf, "##INFO=<ID=K%zu,Number=0,Type=Flag,Description=\"Found at k=%zu\">\n", kmer_size, kmer_size); strbuf_append_str(&hdrbuf, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"); strbuf_append_str(&hdrbuf, "##FILTER=<ID=PASS,Description=\"All filters passed\">\n"); // Print reference paths strbuf_append_str(&hdrbuf, "##reference="); strbuf_append_str(&hdrbuf, ref_paths[0]); for(i = 1; i < nref_paths; i++) { strbuf_append_char(&hdrbuf, ','); strbuf_append_str(&hdrbuf, ref_paths[i]); } strbuf_append_str(&hdrbuf, "\n"); // Print contigs lengths for(i = 0; i < nchroms; i++) { strbuf_sprintf(&hdrbuf, "##contig=<ID=%s,length=%zu>\n", chroms[i].name.b, chroms[i].seq.end); } // Print VCF column header strbuf_append_str(&hdrbuf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT"); if(is_breakpoint) { // Print a column for each sample cJSON *graph_json = json_hdr_get(json, "graph", cJSON_Object, in_path); cJSON *colours_json = json_hdr_get(graph_json, "colours", cJSON_Array, in_path); cJSON *colour_json = colours_json->child; if(colour_json == NULL) die("Missing colours"); for(; colour_json; colour_json = colour_json->next) { if(!json_hdr_colour_is_ref(colour_json)) { cJSON *sample_json = json_hdr_get(colour_json, "sample", cJSON_String, in_path); strbuf_append_str(&hdrbuf, "\t"); strbuf_append_str(&hdrbuf, sample_json->valuestring); } } } strbuf_append_char(&hdrbuf, '\n'); bcf_hdr_t *hdr = bcf_hdr_init("w"); if(bcf_hdr_parse(hdr, hdrbuf.b) != 0) die("Cannot construct VCF header"); strbuf_dealloc(&hdrbuf); return hdr; }
int ctx_join(int argc, char **argv) { struct MemArgs memargs = MEM_ARGS_INIT; const char *out_path = NULL; size_t use_ncols = 0; GraphFileReader tmp_gfile; GraphFileBuffer isec_gfiles_buf; gfile_buf_alloc(&isec_gfiles_buf, 8); // Arg parsing char cmd[100], shortopts[100]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'm': cmd_mem_args_set_memory(&memargs, optarg); break; case 'n': cmd_mem_args_set_nkmers(&memargs, optarg); break; case 'N': cmd_check(!use_ncols, cmd); use_ncols = cmd_uint32_nonzero(cmd, optarg); break; case 'i': graph_file_reset(&tmp_gfile); graph_file_open(&tmp_gfile, optarg); if(file_filter_into_ncols(&tmp_gfile.fltr) > 1) warn("Flattening intersection graph into colour 0: %s", optarg); file_filter_flatten(&tmp_gfile.fltr, 0); gfile_buf_push(&isec_gfiles_buf, &tmp_gfile, 1); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" join -h` for help. Bad option: %s", argv[optind-1]); default: abort(); } } GraphFileReader *igfiles = isec_gfiles_buf.b; size_t num_igfiles = isec_gfiles_buf.len; if(!out_path) cmd_print_usage("--out <out.ctx> required"); if(optind >= argc) cmd_print_usage("Please specify at least one input graph file"); // optind .. argend-1 are graphs to load size_t num_gfiles = (size_t)(argc - optind); char **gfile_paths = argv + optind; GraphFileReader *gfiles = ctx_calloc(num_gfiles, sizeof(GraphFileReader)); status("Probing %zu graph files and %zu intersect files", num_gfiles, num_igfiles); // Check all binaries are valid binaries with matching kmer size size_t i; size_t ctx_max_cols = 0; uint64_t min_intersect_num_kmers = 0, ctx_max_kmers = 0, ctx_sum_kmers = 0; for(i = 0; i < num_gfiles; i++) { graph_file_open2(&gfiles[i], gfile_paths[i], "r", true, ctx_max_cols); if(gfiles[0].hdr.kmer_size != gfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, gfiles[i].hdr.kmer_size); } ctx_max_cols = MAX2(ctx_max_cols, file_filter_into_ncols(&gfiles[i].fltr)); ctx_max_kmers = MAX2(ctx_max_kmers, graph_file_nkmers(&gfiles[i])); ctx_sum_kmers += graph_file_nkmers(&gfiles[i]); } // Probe intersection graph files for(i = 0; i < num_igfiles; i++) { if(gfiles[0].hdr.kmer_size != igfiles[i].hdr.kmer_size) { cmd_print_usage("Kmer sizes don't match [%u vs %u]", gfiles[0].hdr.kmer_size, igfiles[i].hdr.kmer_size); } uint64_t nkmers = graph_file_nkmers(&igfiles[i]); if(i == 0) min_intersect_num_kmers = nkmers; else if(nkmers < min_intersect_num_kmers) { // Put smallest intersection binary first SWAP(igfiles[i], igfiles[0]); min_intersect_num_kmers = nkmers; } } bool take_intersect = (num_igfiles > 0); // If we are taking an intersection, // all kmers intersection kmers will need to be loaded if(take_intersect) ctx_max_kmers = ctx_sum_kmers = min_intersect_num_kmers; bool use_ncols_set = (use_ncols > 0); bool output_to_stdout = (strcmp(out_path,"-") == 0); // if(use_ncols == 0) use_ncols = 1; if(use_ncols_set) { if(use_ncols < ctx_max_cols && output_to_stdout) die("I need %zu colours if outputting to STDOUT (--ncols)", ctx_max_cols); if(use_ncols > ctx_max_cols) { warn("I only need %zu 
colour%s ('--ncols %zu' ignored)", ctx_max_cols, util_plural_str(ctx_max_cols), use_ncols); use_ncols = ctx_max_cols; } } else { use_ncols = output_to_stdout ? ctx_max_cols : 1; } // Check out_path is writable futil_create_output(out_path); status("Output %zu cols; from %zu files; intersecting %zu graphs; ", ctx_max_cols, num_gfiles, num_igfiles); if(num_gfiles == 1 && num_igfiles == 0) { // Loading only one file with no intersection files // Don't need to store a graph in memory, can filter as stream // Don't actually store anything in the de Bruijn graph, but we need to // pass it, so mock one up dBGraph db_graph; db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, file_filter_into_ncols(&gfiles[0].fltr), 0, 1024, 0); graph_writer_stream_mkhdr(out_path, &gfiles[0], &db_graph, NULL, NULL); graph_file_close(&gfiles[0]); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; } // // Decide on memory // size_t bits_per_kmer, kmers_in_hash, graph_mem; bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); if(!use_ncols_set) { // Maximise use_ncols size_t max_usencols = (memargs.mem_to_use*8) / bits_per_kmer; use_ncols = MIN2(max_usencols, ctx_max_cols); bits_per_kmer = sizeof(BinaryKmer)*8 + (sizeof(Covg) + sizeof(Edges)) * 8 * use_ncols; // Re-check memory used kmers_in_hash = cmd_get_kmers_in_hash(memargs.mem_to_use, memargs.mem_to_use_set, memargs.num_kmers, memargs.num_kmers_set, bits_per_kmer, ctx_max_kmers, ctx_sum_kmers, true, &graph_mem); } status("Using %zu colour%s in memory", use_ncols, util_plural_str(use_ncols)); cmd_check_mem_limit(memargs.mem_to_use, graph_mem); // Create db_graph dBGraph db_graph; Edges *intersect_edges = NULL; size_t edge_cols = (use_ncols + take_intersect); db_graph_alloc(&db_graph, gfiles[0].hdr.kmer_size, use_ncols, use_ncols, kmers_in_hash, DBG_ALLOC_COVGS); // We allocate edges ourself since it's a special case db_graph.col_edges = ctx_calloc(db_graph.ht.capacity*edge_cols, sizeof(Edges)); // Load intersection binaries char *intsct_gname_ptr = NULL; StrBuf intersect_gname; strbuf_alloc(&intersect_gname, 1024); if(take_intersect) { GraphLoadingPrefs gprefs = graph_loading_prefs(&db_graph); gprefs.boolean_covgs = true; // covg++ only for(i = 0; i < num_igfiles; i++) { graph_load(&igfiles[i], gprefs, NULL); // Update intersect header // note: intersection graphs all load exactly one colour into colour 0 graph_info_make_intersect(&igfiles[i].hdr.ginfo[0], &intersect_gname); gprefs.must_exist_in_graph = true; gprefs.must_exist_in_edges = db_graph.col_edges; } if(num_igfiles > 1) { // Remove nodes where covg != num_igfiles HASH_ITERATE_SAFE(&db_graph.ht, remove_non_intersect_nodes, db_graph.col_covgs, (Covg)num_igfiles, &db_graph.ht); } status("Loaded intersection set\n"); intsct_gname_ptr = intersect_gname.b; for(i = 0; i < num_igfiles; i++) graph_file_close(&igfiles[i]); // Reset graph info for(i = 0; i < db_graph.num_of_cols; i++) graph_info_init(&db_graph.ginfo[i]); // Zero covgs memset(db_graph.col_covgs, 0, db_graph.ht.capacity * sizeof(Covg)); // Use union edges we loaded to intersect new edges intersect_edges = db_graph.col_edges; db_graph.col_edges += db_graph.ht.capacity; } bool kmers_loaded = take_intersect, colours_loaded = false; graph_writer_merge_mkhdr(out_path, gfiles, num_gfiles, 
kmers_loaded, colours_loaded, intersect_edges, intsct_gname_ptr, &db_graph); if(take_intersect) db_graph.col_edges -= db_graph.ht.capacity; for(i = 0; i < num_gfiles; i++) graph_file_close(&gfiles[i]); strbuf_dealloc(&intersect_gname); gfile_buf_dealloc(&isec_gfiles_buf); ctx_free(gfiles); db_graph_dealloc(&db_graph); return EXIT_SUCCESS; }
int ctx_links(int argc, char **argv) { size_t limit = 0; const char *link_out_path = NULL, *csv_out_path = NULL, *plot_out_path = NULL; const char *thresh_path = NULL, *hist_path = NULL; size_t hist_distsize = 0, hist_covgsize = 0; size_t cutoff = 0; bool clean = false; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!link_out_path, cmd); link_out_path = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'l': cmd_check(!csv_out_path, cmd); csv_out_path = optarg; break; case 'c': cmd_check(!cutoff, cmd); cutoff = cmd_size(cmd, optarg); clean = true; break; case 'L': cmd_check(!limit, cmd); limit = cmd_size(cmd, optarg); break; case 'P': cmd_check(!plot_out_path, cmd); plot_out_path = optarg; break; case 'T': cmd_check(!thresh_path, cmd); thresh_path = optarg; break; case 'H': cmd_check(!hist_path, cmd); hist_path = optarg; break; case 'C': cmd_check(!hist_covgsize, cmd); hist_covgsize = cmd_size(cmd, optarg); break; case 'D': cmd_check(!hist_distsize, cmd); hist_distsize = cmd_size(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ // cmd_print_usage(NULL); die("`"CMD" links -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } if(hist_distsize && !hist_path) cmd_print_usage("--max-dist without --covg-hist"); if(hist_covgsize && !hist_path) cmd_print_usage("--max-covg without --covg-hist"); // Defaults if(!hist_distsize) hist_distsize = DEFAULT_MAX_DIST; if(!hist_covgsize) hist_covgsize = DEFAULT_MAX_COVG; if(optind + 1 != argc) cmd_print_usage("Wrong number of arguments"); const char *ctp_path = argv[optind]; bool list = (csv_out_path != NULL); bool plot = (plot_out_path != NULL); bool save = (link_out_path != NULL); bool hist_covg = (thresh_path != NULL || hist_path != NULL); size_t plot_kmer_idx = (limit == 0 ? 0 : limit - 1); if(clean && !save) cmd_print_usage("Need to give --out <out.ctp.gz> with --clean"); if(!save && !list && !plot && !hist_covg) cmd_print_usage("Please specify one of --plot, --list or --clean"); if(link_out_path && hist_covg && strcmp(link_out_path,"-") == 0) cmd_print_usage("Outputting both cleaning threshold (-T) and links (-o) to STDOUT!"); // Open input file FILE *list_fh = NULL, *plot_fh = NULL, *link_tmp_fh = NULL; FILE *thresh_fh = NULL, *hist_fh = NULL; gzFile link_gz = NULL; // Check files don't exist or that we can overwrite // Will ignore if path is null bool err = false; err |= futil_check_outfile(csv_out_path); err |= futil_check_outfile(plot_out_path); err |= futil_check_outfile(link_out_path); err |= futil_check_outfile(thresh_path); err |= futil_check_outfile(hist_path); if(err) die("Use -f,--force to overwrite files"); StrBuf link_tmp_path; strbuf_alloc(&link_tmp_path, 1024); GPathReader ctpin; memset(&ctpin, 0, sizeof(ctpin)); gpath_reader_open(&ctpin, ctp_path); size_t ncols = file_filter_into_ncols(&ctpin.fltr); size_t kmer_size = gpath_reader_get_kmer_size(&ctpin); cJSON *newhdr = cJSON_Duplicate(ctpin.json, 1); if(ncols != 1) die("Can only clean a single colour at a time. Sorry."); uint64_t (*hists)[hist_covgsize] = NULL; if(hist_covg) { hists = ctx_calloc(hist_distsize, sizeof(hists[0])); } if(hist_path && (hist_fh = futil_fopen_create(hist_path, "w")) == NULL) die("Cannot open file: %s", hist_path); if(thresh_path && (thresh_fh = futil_fopen_create(thresh_path, "w")) == NULL) die("Cannot open file: %s", thresh_path); if(limit) status("Limiting to the first %zu kmers", limit); if(clean) { timestamp(); message(" Cleaning coverage below %zu", cutoff); message("\n"); } if(save) { // Check we can find the fields we need cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); if(!nkmers_json || !nlinks_json || !nbytes_json) die("Cannot find required header entries"); // Create a random temporary file link_tmp_fh = create_tmp_file(&link_tmp_path, link_out_path); status("Saving output to: %s", link_out_path); status("Temporary output: %s", link_tmp_path.b); // Open output file if((link_gz = futil_gzopen_create(link_out_path, "w")) == NULL) die("Cannot open output link file: %s", link_out_path); // Need to open output file first so we can get absolute path // Update the header to include this command json_hdr_add_curr_cmd(newhdr, link_out_path); } if(list) { status("Listing to %s", csv_out_path); if((list_fh = futil_fopen_create(csv_out_path, "w")) == NULL) die("Cannot open output CSV file %s", csv_out_path); // Print csv header fprintf(list_fh, "SeqLen,Covg\n"); } if(plot) { status("Plotting kmer %zu to %s", plot_kmer_idx, plot_out_path); if((plot_fh = futil_fopen_create(plot_out_path, "w")) == NULL) die("Cannot open output .dot file %s", plot_out_path); } SizeBuffer countbuf, jposbuf; size_buf_alloc(&countbuf, 16); size_buf_alloc(&jposbuf, 1024); StrBuf kmerbuf, juncsbuf, seqbuf, outbuf; strbuf_alloc(&kmerbuf, 1024); strbuf_alloc(&juncsbuf, 1024); strbuf_alloc(&seqbuf, 1024); strbuf_alloc(&outbuf, 1024); bool link_fw; size_t njuncs; size_t knum, nlinks, num_links_exp = 0; LinkTree ltree; ltree_alloc(&ltree, kmer_size); LinkTreeStats tree_stats; memset(&tree_stats, 0, sizeof(tree_stats)); size_t init_num_links = 0, num_links = 0; for(knum = 0; !limit || knum < limit; knum++) { ltree_reset(&ltree); if(!gpath_reader_read_kmer(&ctpin, &kmerbuf, &num_links_exp)) break; ctx_assert2(kmerbuf.end == kmer_size, "Kmer incorrect length %zu != %zu", kmerbuf.end, kmer_size); // status("kmer: %s", kmerbuf.b); for(nlinks = 0; gpath_reader_read_link(&ctpin, &link_fw, &njuncs, &countbuf, &juncsbuf, &seqbuf, &jposbuf); nlinks++) { ltree_add(&ltree, link_fw, countbuf.b[0], jposbuf.b, juncsbuf.b, seqbuf.b); } if(nlinks != num_links_exp) warn("Links count mismatch %zu != %zu", nlinks, num_links_exp); if(hist_covg) { ltree_update_covg_hists(&ltree, (uint64_t*)hists, hist_distsize, hist_covgsize); } if(clean) { ltree_clean(&ltree, cutoff); } // Accumulate statistics ltree_get_stats(&ltree, &tree_stats); num_links = tree_stats.num_links - init_num_links; init_num_links = tree_stats.num_links; if(list) { ltree_write_list(&ltree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, list_fh) != outbuf.end) die("Cannot write CSV file to: %s", csv_out_path); strbuf_reset(&outbuf); } if(save && num_links) { ltree_write_ctp(&ltree, kmerbuf.b, num_links, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, link_tmp_fh) != outbuf.end) die("Cannot write ctp file to: %s", link_tmp_path.b); strbuf_reset(&outbuf); } if(plot && knum == plot_kmer_idx) { status("Plotting tree..."); ltree_write_dot(&ltree, &outbuf); if(fwrite(outbuf.b, 1, outbuf.end, plot_fh) != outbuf.end) die("Cannot write plot DOT file to: %s", plot_out_path); strbuf_reset(&outbuf); } } gpath_reader_close(&ctpin); cJSON *links_json = json_hdr_get(newhdr, "paths", cJSON_Object, link_out_path); cJSON *nkmers_json = json_hdr_get(links_json, "num_kmers_with_paths", cJSON_Number, link_out_path); cJSON *nlinks_json = json_hdr_get(links_json, "num_paths", cJSON_Number, link_out_path); cJSON *nbytes_json = json_hdr_get(links_json, "path_bytes", cJSON_Number, link_out_path); status("Number of kmers with links %li -> %zu", nkmers_json->valueint, tree_stats.num_trees_with_links); status("Number of links %li -> %zu", nlinks_json->valueint, tree_stats.num_links); status("Number of bytes %li -> %zu", nbytes_json->valueint, tree_stats.num_link_bytes); if(save) { // Update JSON nkmers_json->valuedouble = nkmers_json->valueint = tree_stats.num_trees_with_links; nlinks_json->valuedouble = nlinks_json->valueint = tree_stats.num_links; nbytes_json->valuedouble = nbytes_json->valueint = tree_stats.num_link_bytes; char *json_str = cJSON_Print(newhdr); if(gzputs(link_gz, json_str) != (int)strlen(json_str)) die("Cannot write ctp file to: %s", link_out_path); free(json_str); gzputs(link_gz, "\n\n"); gzputs(link_gz, ctp_explanation_comment); gzputs(link_gz, "\n"); fseek(link_tmp_fh, 0, SEEK_SET); char *tmp = ctx_malloc(4*ONE_MEGABYTE); size_t s; while((s = fread(tmp, 1, 4*ONE_MEGABYTE, link_tmp_fh)) > 0) { if(gzwrite(link_gz, tmp, s) != (int)s) die("Cannot write to output: %s", link_out_path); } ctx_free(tmp); gzclose(link_gz); fclose(link_tmp_fh); } // Write histogram to file if(hist_fh) { size_t i, j; fprintf(hist_fh, " "); for(j = 1; j < hist_covgsize; j++) fprintf(hist_fh, ",covg.%02zu", j); fprintf(hist_fh, "\n"); for(i = 1; i < hist_distsize; i++) { fprintf(hist_fh, "dist.%02zu", i); for(j = 1; j < hist_covgsize; j++) { fprintf(hist_fh, ",%"PRIu64, hists[i][j]); } fprintf(hist_fh, "\n"); } } if(thresh_fh) { // Use median of first five cutoffs print_suggest_cutoff(6, hist_covgsize, hists, thresh_fh); } if(hist_fh && hist_fh != stdout) fclose(hist_fh); if(list) { fclose(list_fh); } if(plot) { fclose(plot_fh); } ctx_free(hists); cJSON_Delete(newhdr); strbuf_dealloc(&link_tmp_path); ltree_dealloc(&ltree); size_buf_dealloc(&countbuf); size_buf_dealloc(&jposbuf); strbuf_dealloc(&kmerbuf); strbuf_dealloc(&juncsbuf); strbuf_dealloc(&seqbuf); strbuf_dealloc(&outbuf); return EXIT_SUCCESS; }
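// ctx_links() above buffers link records in a temporary FILE* because the
// JSON header counts (kmers/links/bytes) are only known once the whole input
// has been read; the records are then copied behind the finished header in
// the gzip output. A minimal sketch of that copy step, assuming zlib's
// gzwrite() and a seekable temporary file (the helper name is illustrative):
#include <stdio.h>
#include <zlib.h>
static int copy_tmp_into_gz(FILE *tmp_fh, gzFile gzout)
{
  char buf[1u << 16];
  size_t n;
  if(fseek(tmp_fh, 0, SEEK_SET) != 0) return -1;   // rewind the temp file
  while((n = fread(buf, 1, sizeof(buf), tmp_fh)) > 0)
    if(gzwrite(gzout, buf, (unsigned)n) != (int)n)  // short write => error
      return -1;
  return ferror(tmp_fh) ? -1 : 0;
}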