char* graph_step_status2str(enum GraphStepStatus status, char *str, size_t len)
{
  ctx_assert(len >= 20); (void)len;
  ctx_assert(status < GRPHWLK_NUM_STATES);
  strcpy(str, graph_step_str[status]);
  return str;
}
size_t infer_edges(size_t nthreads, bool add_all_edges, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);
  ctx_assert(db_graph->col_edges != NULL);

  size_t i, num_nodes_modified = 0;
  status("[inferedges] Processing stream");

  InferEdgesWorker *wrkrs = ctx_calloc(nthreads, sizeof(InferEdgesWorker));

  for(i = 0; i < nthreads; i++) {
    InferEdgesWorker tmp = {.threadid = i, .nthreads = nthreads,
                            .add_all_edges = add_all_edges,
                            .db_graph = db_graph,
                            .num_nodes_modified = 0};
    memcpy(&wrkrs[i], &tmp, sizeof(InferEdgesWorker));
  }

  util_run_threads(wrkrs, nthreads, sizeof(InferEdgesWorker),
                   nthreads, infer_edges_worker);

  // Sum up nodes modified
  for(i = 0; i < nthreads; i++)
    num_nodes_modified += wrkrs[i].num_nodes_modified;

  ctx_free(wrkrs);

  return num_nodes_modified;
}
char* assem2str(enum AssemStopCause assem, char *str, size_t size)
{
  ctx_assert(assem < ASSEM_NUM_STOPS);
  ctx_assert(strlen(assem_stop_str[assem]) < size);
  strcpy(str, assem_stop_str[assem]);
  return str;
}
// Edges restricted to this colour, only in one direction (node.orient)
Edges db_node_edges_in_col(dBNode node, size_t col, const dBGraph *db_graph)
{
  if(db_graph->num_edge_cols > 1) {
    Edges edges = db_node_get_edges(db_graph, node.key, col);
    return edges_mask_orientation(edges, node.orient);
  }

  // Edges are merged into one colour
  ctx_assert(db_graph->num_edge_cols == 1);
  ctx_assert(db_graph->node_in_cols != NULL || db_graph->col_covgs != NULL);

  // Check which next nodes are in the given colour
  dBNode nodes[4];
  Nucleotide nucs[4];
  Edges edges = 0;
  size_t i, n;

  n = db_graph_next_nodes_in_col(db_graph, node, col, nodes, nucs);

  for(i = 0; i < n; i++)
    edges = edges_set_edge(edges, nucs[i], node.orient);

  return edges;
}
void hash_table_empty(HashTable *const ht)
{
  memset(ht->table, 0, ht->capacity * sizeof(BinaryKmer));
  memset(ht->buckets, 0, ht->num_of_buckets * sizeof(uint8_t[2]));

  HashTable data = {.table = ht->table,
                    .num_of_buckets = ht->num_of_buckets,
                    .hash_mask = ht->hash_mask,
                    .bucket_size = ht->bucket_size,
                    .capacity = ht->capacity,
                    .buckets = ht->buckets,
                    .num_kmers = 0,
                    .collisions = {0}};

  memcpy(ht, &data, sizeof(data));
}

static inline
const BinaryKmer* hash_table_find_in_bucket(const HashTable *const ht,
                                            uint_fast32_t bucket,
                                            BinaryKmer bkmer)
{
  const BinaryKmer *ptr = ht_bckt_ptr(ht, bucket);
  const BinaryKmer *end = ptr + hash_table_bsize(ht, bucket);
  bkmer.b[0] |= BKMER_SET_FLAG; // mark as assigned in the hash table

  while(ptr < end) {
    if(binary_kmer_eq(bkmer, *ptr)) return ptr;
    ptr++;
  }
  return NULL; // Not found
}

// Remember to increment ht->num_kmers
static inline
BinaryKmer* hash_table_insert_in_bucket(HashTable *ht, uint_fast32_t bucket,
                                        BinaryKmer bkmer)
{
  size_t bsize = hash_table_bsize(ht, bucket);
  size_t bitems = hash_table_bitems(ht, bucket);
  ctx_assert(bitems < ht->bucket_size);
  ctx_assert(bitems <= bsize);

  BinaryKmer *ptr = ht_bckt_ptr(ht, bucket);
  bkmer.b[0] |= BKMER_SET_FLAG; // mark as assigned in the hash table

  if(bitems == bsize) {
    ptr += bsize;
    ht->buckets[bucket][HT_BSIZE]++;
  }
  else {
    // Find an entry that has been deleted from this bucket previously
    while(HASH_ENTRY_ASSIGNED(*ptr)) ptr++;
  }

  *ptr = bkmer;
  ht->buckets[bucket][HT_BITEMS]++;
  return ptr;
}
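// The bucket helpers above avoid tombstone records: HT_BSIZE tracks how far a
// bucket has ever been filled, HT_BITEMS how many slots are currently
// assigned, and a flag bit on the stored kmer marks a slot as in use, so an
// insert can simply reuse the first unassigned slot. A minimal, self-contained
// sketch of that idea on a plain uint64_t bucket (names and layout here are
// illustrative, not the codebase's):

#include <stdint.h>
#include <stdio.h>

#define SET_FLAG (1ULL << 63)              /* top bit marks an assigned slot */
#define IS_SET(x) (((x) & SET_FLAG) != 0)

/* Insert `key` into a bucket that has `*used` slots touched so far out of
   `capacity`. Reuses the first previously-deleted slot, else appends. */
static size_t bucket_insert(uint64_t *bucket, size_t *used,
                            size_t capacity, uint64_t key)
{
  size_t i;
  key |= SET_FLAG;
  for(i = 0; i < *used; i++)
    if(!IS_SET(bucket[i])) { bucket[i] = key; return i; } /* reuse deleted slot */
  if(*used < capacity) { bucket[(*used)++] = key; return *used - 1; }
  return SIZE_MAX; /* bucket full */
}

int main(void)
{
  uint64_t bucket[4] = {0};
  size_t used = 0;
  bucket_insert(bucket, &used, 4, 7);
  bucket_insert(bucket, &used, 4, 9);
  bucket[0] = 0;                                      /* delete key 7 */
  size_t slot = bucket_insert(bucket, &used, 4, 11);  /* reuses slot 0 */
  printf("key 11 stored in slot %zu; %zu slots touched\n", slot, used);
  return 0;
}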
// Returns 1 if a read is a substring of ANY read in the list or a complete
// match with a read before it in the list. Returns <= 0 otherwise.
//  1 => is substr
//  0 => not substr
// -1 => not enough bases of ACGT
static int _is_substr(const ReadBuffer *rbuf, size_t idx,
                      const KOGraph *kograph, const dBGraph *db_graph)
{
  const size_t kmer_size = db_graph->kmer_size;
  const read_t *r = &rbuf->b[idx], *r2;
  size_t contig_start;

  contig_start = seq_contig_start(r, 0, kmer_size, 0, 0);
  if(contig_start >= r->seq.end) return -1; // No kmers in this sequence

  dBNode node = db_graph_find_str(db_graph, r->seq.b+contig_start);
  ctx_assert(node.key != HASH_NOT_FOUND); // expect at least one hit (for this read!)
  ctx_assert(kograph_occurs(kograph, node.key));

  KOccur *hit;
  for(hit = kograph_get(kograph, node.key); 1; hit++)
  {
    if(hit->chrom != idx)
    {
      r2 = &rbuf->b[hit->chrom];

      // A read is a duplicate (i.e. return 1) if it is a substring of ANY
      // read in the list or a complete match with a read before it in the
      // list. That is why we have: (hit->chrom < idx || r->seq.end < r2->seq.end)
      // since identical strings have equal length
      if(hit->chrom < idx || r->seq.end < r2->seq.end)
      {
        if(hit->orient == node.orient)
        {
          // potential FORWARD match
          if(hit->offset >= contig_start &&
             hit->offset + r->seq.end <= r2->seq.end &&
             strncasecmp(r->seq.b, r2->seq.b+hit->offset-contig_start,
                         r->seq.end) == 0)
          {
            return 1;
          }
        }
        else
        {
          // potential REVERSE match
          // if read is '<NNNN>[kmer]<rem>' rX_rem is the number of chars after
          // the first valid kmer
          size_t r1_rem = r->seq.end - (contig_start + kmer_size);
          size_t r2_rem = r2->seq.end - (hit->offset + kmer_size);

          if(r1_rem <= hit->offset && r2_rem >= contig_start &&
             dna_revncasecmp(r->seq.b, r2->seq.b+hit->offset-r1_rem,
                             r->seq.end) == 0)
          {
            return 1;
          }
        }
      }
    }
    if(!hit->next) break;
  }

  return 0;
}
/**
 * @param right_edge is true iff this kmer is the last in a unitig
 */
static inline void _print_edge(hkey_t node, bool right_edge,
                               BinaryKmer bkey, Edges edges,
                               UnitigEnd uend0, UnitigPrinter *p)
{
  // DOT: leave from east end if +, west end if -
  //      connect to west end if +, east end if -
  const char dot_exit[2] = "ew", dot_join[2] = "we", gfa_orient[2] = "+-";

  size_t i, n;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  Orientation orient = right_edge ? uend0.rorient : !uend0.lorient;

  // Unitig orientations
  Orientation ut_or0 = right_edge ? FORWARD : REVERSE, ut_or1;

  n = db_graph_next_nodes(p->db_graph, bkey, orient, edges,
                          next_nodes, next_nucs);

  for(i = 0; i < n; i++)
  {
    UnitigEnd uend1 = p->ugraph.unitig_ends[next_nodes[i].key];

    char tmpstr[100];
    db_node_to_str(p->db_graph, next_nodes[i], tmpstr);
    if(!uend1.assigned)
      status(" -> node %zu [%s]", uend1.unitigid, tmpstr);

    ctx_assert(next_nodes[i].key != HASH_NOT_FOUND);
    ctx_assert(uend1.assigned);

    ut_or1 = next_nodes[i].orient == uend1.lorient ? FORWARD : REVERSE;

    // Don't do reverse-to-reverse links when node links to itself,
    // these are duplicates of forward-to-forward
    if(node < next_nodes[i].key ||
       (node == next_nodes[i].key && ut_or0 + ut_or1 < 2))
    {
      pthread_mutex_lock(&p->outlock);

      switch(p->syntax) {
        case PRINT_DOT:
          fprintf(p->fout, " node%zu:%c -> node%zu:%c\n",
                  (size_t)uend0.unitigid, dot_exit[ut_or0],
                  (size_t)uend1.unitigid, dot_join[ut_or1]);
          break;
        case PRINT_GFA:
          fprintf(p->fout, "L\tnode%zu\t%c\tnode%zu\t%c\t%zuM\n",
                  (size_t)uend0.unitigid, gfa_orient[ut_or0],
                  (size_t)uend1.unitigid, gfa_orient[ut_or1],
                  p->db_graph->kmer_size - 1);
          break;
        default: die("Bad syntax: %i", p->syntax);
      }

      pthread_mutex_unlock(&p->outlock);
    }
  }
}
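// For orientation, the two fprintf() branches above produce edge records of
// the following shapes (unitig ids illustrative; with a k=31 graph the GFA
// link overlap is k-1 = 30):
//
//   PRINT_DOT:  node12:e -> node34:w
//   PRINT_GFA:  L <tab> node12 <tab> + <tab> node34 <tab> - <tab> 30M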
// Using file so can call fseek and don't need to load whole graph
static size_t inferedges_on_mmap(const dBGraph *db_graph, bool add_all_edges,
                                 GraphFileReader *file)
{
  ctx_assert(db_graph->num_of_cols == file->hdr.num_of_cols);
  ctx_assert(file_filter_is_direct(&file->fltr));
  ctx_assert2(!isatty(fileno(file->fh)), "Use inferedges_on_stream() instead");
  ctx_assert(file->num_of_kmers >= 0);
  ctx_assert(file->file_size >= 0);

  status("[inferedges] Processing mmap file: %s [hdr: %zu bytes file: %zu bytes]",
         file_filter_path(&file->fltr),
         (size_t)file->hdr_size, (size_t)file->file_size);

  if(fseek(file->fh, 0, SEEK_SET) != 0)
    die("fseek failed: %s", strerror(errno));

  // Open memory mapped file
  // Need read+write access: we read each record then write back any updates
  void *mmap_ptr = mmap(NULL, file->file_size, PROT_READ|PROT_WRITE, MAP_SHARED,
                        fileno(file->fh), 0);

  if(mmap_ptr == MAP_FAILED)
    die("Cannot memory map file: %s [%s]", file->fltr.path.b, strerror(errno));

  const size_t ncols = file->hdr.num_of_cols;
  BinaryKmer bkmer;
  Edges edges[ncols];
  Covg covgs[ncols];
  bool updated;
  size_t i, num_kmers = file->num_of_kmers, num_kmers_edited = 0;
  size_t filekmersize = sizeof(BinaryKmer) + (sizeof(Edges)+sizeof(Covg)) * ncols;

  char *ptr = (char*)mmap_ptr + file->hdr_size;

  for(i = 0; i < num_kmers; i++, ptr += filekmersize)
  {
    char *fh_covgs = ptr + sizeof(BinaryKmer);
    char *fh_edges = fh_covgs + sizeof(Covg)*ncols;

    memcpy(bkmer.b, ptr,      sizeof(BinaryKmer));
    memcpy(covgs,   fh_covgs, ncols * sizeof(Covg));
    memcpy(edges,   fh_edges, ncols * sizeof(Edges));

    updated = (add_all_edges ? infer_all_edges(bkmer, edges, covgs, db_graph)
                             : infer_pop_edges(bkmer, edges, covgs, db_graph));

    if(updated) {
      memcpy(fh_covgs, covgs, ncols * sizeof(Covg));
      memcpy(fh_edges, edges, ncols * sizeof(Edges));
      num_kmers_edited++;
    }
  }

  if(munmap(mmap_ptr, file->file_size) == -1)
    die("Cannot release mmap file: %s [%s]", file->fltr.path.b, strerror(errno));

  return num_kmers_edited;
}
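// inferedges_on_mmap() edits graph records in place through a shared,
// writable mapping. A generic, self-contained sketch of that mmap pattern
// (plain POSIX calls only; bump_first_byte() is a made-up example, not a
// helper from this codebase):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static int bump_first_byte(const char *path)
{
  int fd = open(path, O_RDWR);
  if(fd < 0) { perror("open"); return -1; }

  struct stat st;
  if(fstat(fd, &st) != 0 || st.st_size == 0) { close(fd); return -1; }

  // MAP_SHARED: stores through the mapping are written back to the file
  unsigned char *map = mmap(NULL, st.st_size, PROT_READ|PROT_WRITE,
                            MAP_SHARED, fd, 0);
  if(map == MAP_FAILED) { perror("mmap"); close(fd); return -1; }

  map[0]++;                      // edit the file in place

  munmap(map, st.st_size);
  close(fd);
  return 0;
}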
// `node1` should be the first node of a supernode
// `node0` should be the previous node
// `jmpfunc` is called with each supernode traversed and if it returns true
//   we continue crawling, otherwise we stop
// `endfunc` is a function called at the end of traversal
void graph_crawler_fetch(GraphCrawler *crawler, dBNode node0,
                         dBNode next_nodes[4], size_t take_idx, size_t num_next,
                         uint32_t *cols, size_t ncols,
                         bool (*jmpfunc)(GraphCache *_cache, GCacheStep *_step, void *_arg),
                         void (*endfunc)(GraphCache *_cache, uint32_t _pathid, void *_arg),
                         void *arg)
{
  const dBGraph *db_graph = crawler->cache.db_graph;
  GraphCache *cache = &crawler->cache;
  GraphWalker *wlk = &crawler->wlk;
  RepeatWalker *rptwlk = &crawler->rptwlk;
  GCUniColPath *unipaths = crawler->unicol_paths;

  ctx_assert(take_idx < num_next);
  ctx_assert(!db_nodes_are_equal(node0, next_nodes[take_idx]));

  // Fetch all paths in all colours
  dBNode node1 = next_nodes[take_idx];
  bool is_fork;
  size_t i, c, col, nedges_cols, num_unicol_paths = 0;
  int pathid;

  for(c = 0; c < ncols; c++)
  {
    col = (cols != NULL ? cols[c] : c);

    if(db_node_has_col(db_graph, node0.key, col) &&
       db_node_has_col(db_graph, node1.key, col))
    {
      // Determine if this fork is a fork in the current colour
      for(nedges_cols = 0, i = 0; i < num_next && nedges_cols <= 1; i++)
        nedges_cols += db_node_has_col(db_graph, next_nodes[i].key, col);

      is_fork = (nedges_cols > 1);

      graph_walker_setup(wlk, true, col, col, db_graph);
      graph_walker_start(wlk, node0);
      graph_walker_force(wlk, node1, is_fork);

      pathid = graph_crawler_load_path(cache, node1, wlk, rptwlk, jmpfunc, arg);

      if(endfunc != NULL) endfunc(cache, pathid, arg);

      graph_walker_finish(wlk);
      graph_crawler_reset_rpt_walker(rptwlk, cache, pathid);

      unipaths[num_unicol_paths++] = (GCUniColPath){.colour = col,
                                                    .pathid = pathid};
    }
    else pathid = -1;

    crawler->col_paths[col] = pathid;
  }
// Returns sorted array of hkey_t from the hash table
hkey_t* hash_table_sorted(const HashTable *htable)
{
  ctx_assert(sizeof(hkey_t) == sizeof(BinaryKmer*));
  ctx_assert(sizeof(hkey_t) == sizeof(BkmerPtrHkeyUnion));

  BkmerPtrHkeyUnion *kmers, *nxt, *end;
  nxt = kmers = ctx_malloc(sizeof(BkmerPtrHkeyUnion) * htable->num_kmers);
  end = kmers + htable->num_kmers;

  HASH_ITERATE(htable, _fetch_kmer_union, htable, &nxt);

  // Can sort ignoring that the top flag bit is set on all kmers
  qsort(kmers, htable->num_kmers, sizeof(BinaryKmer*), binary_kmers_qcmp_ptrs);

  for(nxt = kmers; nxt < end; nxt++)
    nxt->h = nxt->bptr - htable->table;

  return (hkey_t*)kmers;
}
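// hash_table_sorted() sorts an array of pointers into the kmer table, then
// rewrites each pointer as its index (pointer minus table base), reusing the
// same allocation through a union. A stripped-down, self-contained sketch of
// that trick on a plain int array (PtrIdx and cmp_by_value are illustrative
// names, not from this codebase):

#include <stdio.h>
#include <stdlib.h>

typedef union { const int *ptr; size_t idx; } PtrIdx;

static int cmp_by_value(const void *a, const void *b)
{
  const int *x = ((const PtrIdx*)a)->ptr, *y = ((const PtrIdx*)b)->ptr;
  return (*x > *y) - (*x < *y);
}

int main(void)
{
  int table[5] = {42, 7, 19, 3, 25};
  PtrIdx u[5];
  size_t i;

  for(i = 0; i < 5; i++) u[i].ptr = &table[i];        // collect pointers
  qsort(u, 5, sizeof(u[0]), cmp_by_value);            // sort by pointed-to value
  for(i = 0; i < 5; i++) u[i].idx = u[i].ptr - table; // rewrite pointers as indices

  for(i = 0; i < 5; i++) printf("%zu ", u[i].idx);    // prints: 3 1 2 4 0
  printf("\n");
  return 0;
}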
void seq_reader_orient_mp_FF_or_RR(read_t *r1, read_t *r2, ReadMateDir matedir)
{
  ctx_assert(r1 != NULL);
  ctx_assert(r2 != NULL);

  switch(matedir) {
    case READPAIR_FF: return;
    case READPAIR_FR: seq_read_reverse_complement(r2); return;
    case READPAIR_RF: seq_read_reverse_complement(r1); return;
    case READPAIR_RR: return;
    default: ctx_assert2(0, "Invalid ReadMateDir value: %i", (int)matedir);
  }
  // ^default should be unreachable
}
/**
 * Save paths to a file.
 * @param gzout         gzFile to write to
 * @param path          path of output file
 * @param save_path_seq if true, save seq= and juncpos= for links, requires
 *                      exactly one colour in the graph
 * @param hdrs          is array of JSON headers of input files
 */
void gpath_save(gzFile gzout, const char *path, size_t nthreads,
                bool save_path_seq,
                const char *cmdstr, cJSON *cmdhdr,
                cJSON **hdrs, size_t nhdrs,
                const ZeroSizeBuffer *contig_hists, size_t ncols,
                dBGraph *db_graph)
{
  ctx_assert(nthreads > 0);
  ctx_assert(gpath_set_has_nseen(&db_graph->gpstore.gpset));
  ctx_assert(ncols == db_graph->gpstore.gpset.ncols);
  ctx_assert(!save_path_seq || db_graph->num_of_cols == 1); // save_path => 1 colour

  char npaths_str[50];
  ulong_to_str(db_graph->gpstore.num_paths, npaths_str);

  status("Saving %s paths to: %s", npaths_str, path);
  status(" using %zu threads", nthreads);

  // Write header
  cJSON *json = gpath_save_mkhdr(path, cmdstr, cmdhdr, hdrs, nhdrs,
                                 contig_hists, ncols, db_graph);
  json_hdr_gzprint(json, gzout);
  cJSON_Delete(json);

  // Print comments about the format
  gzputs(gzout, ctp_explanation_comment);

  // Multithreaded
  GPathSaver *wrkrs = ctx_calloc(nthreads, sizeof(GPathSaver));
  pthread_mutex_t outlock;
  size_t i;

  if(pthread_mutex_init(&outlock, NULL) != 0) die("Mutex init failed");

  for(i = 0; i < nthreads; i++) {
    wrkrs[i] = (GPathSaver){.threadid = i,
                            .nthreads = nthreads,
                            .save_seq = save_path_seq,
                            .gzout = gzout,
                            .outlock = &outlock,
                            .db_graph = db_graph};
  }

  // Iterate over kmers writing paths
  util_run_threads(wrkrs, nthreads, sizeof(*wrkrs), nthreads, gpath_save_thread);

  pthread_mutex_destroy(&outlock);
  ctx_free(wrkrs);

  status("[GPathSave] Graph paths saved to %s", path);
}
/**
 * Generate a JSON header object for a .ctp file
 * @param path         path to output file
 * @param cmdstr       name of the command being run, to be used to add @cmdhdr
 * @param cmdhdr       JSON header to add under current command->@cmdstr
 *                     If cmdstr and cmdhdr are both NULL they are ignored
 * @param contig_hists histograms of read contig lengths, one per colour
 * @param ncols        number of colours / entries in @contig_hists
 */
cJSON* gpath_save_mkhdr(const char *path,
                        const char *cmdstr, cJSON *cmdhdr,
                        cJSON **hdrs, size_t nhdrs,
                        const ZeroSizeBuffer *contig_hists, size_t ncols,
                        const dBGraph *db_graph)
{
  ctx_assert(!cmdstr == !cmdhdr);

  const GPathStore *gpstore = &db_graph->gpstore;
  const GPathSet *gpset = &gpstore->gpset;

  // using json_hdr_make_std() assumes the following
  ctx_assert(gpset->ncols == db_graph->num_of_cols);

  // Construct cJSON
  cJSON *jsonhdr = cJSON_CreateObject();
  cJSON_AddStringToObject(jsonhdr, "file_format", "ctp");
  cJSON_AddNumberToObject(jsonhdr, "format_version", CTP_FORMAT_VERSION);

  // Add standard cortex header info, including the command being run
  json_hdr_make_std(jsonhdr, path, hdrs, nhdrs, db_graph,
                    hash_table_nkmers(&db_graph->ht));

  // Get first command (this one), and command specific extra info
  if(cmdstr) {
    cJSON *cmd = json_hdr_get_curr_cmd(jsonhdr, path);
    cJSON_AddItemToObject(cmd, cmdstr, cmdhdr);
  }

  // Paths info
  cJSON *paths = cJSON_CreateObject();
  cJSON_AddItemToObject(jsonhdr, "paths", paths);

  // Add command specific header fields
  cJSON_AddNumberToObject(paths, "num_kmers_with_paths", gpstore->num_kmers_with_paths);
  cJSON_AddNumberToObject(paths, "num_paths", gpstore->num_paths);
  cJSON_AddNumberToObject(paths, "path_bytes", gpstore->path_bytes);

  // Add size distribution
  cJSON *json_hists = cJSON_CreateArray();
  cJSON_AddItemToObject(paths, "contig_hists", json_hists);

  size_t i;
  for(i = 0; i < ncols; i++)
    _gpath_save_contig_hist2json(json_hists, contig_hists[i].b, contig_hists[i].len);

  return jsonhdr;
}
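// For orientation, the "paths" object built above ends up in the .ctp JSON
// header shaped roughly like this (field names taken from the cJSON_Add*
// calls above; values illustrative):
//
//   "paths": {
//     "num_kmers_with_paths": 12345,
//     "num_paths": 23456,
//     "path_bytes": 345678,
//     "contig_hists": [ ... one entry per colour ... ]
//   }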
/**
 * Calculate cleaning threshold for supernodes from a given distribution
 * of supernode coverages
 * @param covgs histogram of supernode coverages
 */
size_t cleaning_pick_supernode_threshold(const uint64_t *covgs, size_t len,
                                         double seq_depth,
                                         const dBGraph *db_graph)
{
  ctx_assert(len > 5);
  ctx_assert(db_graph->ht.num_kmers > 0);

  size_t i, d1len = len-2, d2len = len-3, f1, f2;
  double *tmp = ctx_malloc((d1len+d2len) * sizeof(double));
  double *delta1 = tmp, *delta2 = tmp + d1len;

  // Get sequencing depth from coverage
  uint64_t covg_sum = 0, capacity = db_graph->ht.capacity * db_graph->num_of_cols;

  for(i = 0; i < capacity; i++)
    covg_sum += db_graph->col_covgs[i];

  double seq_depth_est = (double)covg_sum / db_graph->ht.num_kmers;

  status("[cleaning] Kmer depth before cleaning supernodes: %.2f", seq_depth_est);

  if(seq_depth <= 0) seq_depth = seq_depth_est;
  else status("[cleaning] Using sequence depth argument: %f", seq_depth);

  size_t fallback_thresh = (size_t)MAX2(1, (seq_depth+1)/2);

  // +1 to ensure covgs is never 0
  for(i = 0; i < d1len; i++)
    delta1[i] = (double)(covgs[i+1]+1) / (covgs[i+2]+1);

  d1len = i;
  d2len = d1len - 1;

  if(d1len <= 2) {
    status("[cleaning] (using fallback1)\n");
    ctx_free(tmp);
    return fallback_thresh;
  }

  // d2len is d1len-1
  for(i = 0; i < d2len; i++)
    delta2[i] = delta1[i] / delta1[i+1];

  for(f1 = 0; f1 < d1len && delta1[f1] >= 1; f1++);
  for(f2 = 0; f2 < d2len && delta2[f2] > 1; f2++);

  ctx_free(tmp);

  if(f1 < d1len && f1 < (seq_depth*0.75)) {
    status("[cleaning] (using f1)");
    return f1+1;
  }
  else if(f2 < d2len) {
    status("[cleaning] (using f2)");
    return f2+1;
  }
  else {
    status("[cleaning] (using fallback1)");
    return fallback_thresh+1;
  }
}
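// A toy, self-contained run of the delta1/f1 part of the heuristic above:
// find the first coverage at which the unitig coverage histogram stops
// falling. The histogram values below are invented for illustration.

#include <stdio.h>

int main(void)
{
  unsigned long covgs[] = {0, 100, 40, 15, 8, 20, 30, 25}; /* covgs[c] = #unitigs with coverage c */
  size_t len = sizeof(covgs)/sizeof(covgs[0]), d1len = len-2, i, f1;
  double delta1[8];

  /* ratio of successive histogram bins; +1 keeps the division well defined */
  for(i = 0; i < d1len; i++)
    delta1[i] = (double)(covgs[i+1]+1) / (covgs[i+2]+1);

  /* first index where the histogram is no longer decreasing */
  for(f1 = 0; f1 < d1len && delta1[f1] >= 1; f1++);

  /* delta1 = 2.46 2.56 1.78 0.43 ... => f1 = 3, suggested threshold f1+1 = 4 */
  printf("f1 = %zu, suggested threshold = %zu\n", f1, f1+1);
  return 0;
}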
// Safe to call on different entries at the same time
// NOT safe to do find() whilst doing delete()
void hash_table_delete(HashTable *const ht, hkey_t pos)
{
  uint64_t bucket = pos / ht->bucket_size;

  ctx_assert(pos != HASH_NOT_FOUND);
  ctx_assert(ht->buckets[bucket][HT_BITEMS] > 0);
  ctx_assert(ht->num_kmers > 0);
  ctx_assert(HASH_ENTRY_ASSIGNED(ht->table[pos]));

  ht->table[pos] = unset_bkmer;
  __sync_fetch_and_sub((volatile uint8_t *)&ht->buckets[bucket][HT_BITEMS], 1);
  __sync_fetch_and_sub((volatile uint64_t *)&ht->num_kmers, 1);

  ctx_assert(!HASH_ENTRY_ASSIGNED(ht->table[pos]));
}
// Safe to call on different entries at the same time
// NOT safe to do find() whilst doing delete()
void hash_table_delete(HashTable *const ht, hkey_t pos)
{
  uint64_t bucket = pos / ht->bucket_size, n, m;

  ctx_assert(pos != HASH_NOT_FOUND);
  ctx_assert(HASH_ENTRY_ASSIGNED(ht->table[pos]));

  memset(ht->table+pos, 0, sizeof(BinaryKmer));
  n = __sync_fetch_and_sub((volatile uint64_t *)&ht->num_kmers, 1);
  m = __sync_fetch_and_sub((volatile uint8_t *)&ht->buckets[bucket][HT_BITEMS], 1);

  ctx_assert2(n > 0, "Deleted from empty table");
  ctx_assert2(m > 0, "Deleted from empty bucket");
  ctx_assert(!HASH_ENTRY_ASSIGNED(ht->table[pos]));
}
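// Both delete() variants above rely on __sync_fetch_and_sub() returning the
// value held *before* the subtraction, which is what lets the second version
// assert n > 0 / m > 0 to catch counter underflow. A tiny self-contained
// demonstration of that builtin's behaviour:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  volatile uint64_t counter = 3;
  uint64_t prev;

  prev = __sync_fetch_and_sub(&counter, 1); /* returns the old value */
  assert(prev == 3 && counter == 2);

  prev = __sync_fetch_and_sub(&counter, 1);
  assert(prev == 2 && counter == 1);

  printf("counter = %llu\n", (unsigned long long)counter); /* prints 1 */
  return 0;
}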
void graph_crawler_alloc(GraphCrawler *crawler, const dBGraph *db_graph)
{
  ctx_assert(db_graph->node_in_cols != NULL);

  size_t ncols = db_graph->num_of_cols;
  int *col_paths = ctx_calloc(ncols, sizeof(int));
  GCMultiColPath *multicol_paths = ctx_calloc(ncols, sizeof(GCMultiColPath));
  GCUniColPath *unicol_paths = ctx_calloc(ncols, sizeof(GCUniColPath));
  uint32_t *col_list = ctx_calloc(ncols, sizeof(uint32_t));

  GraphCrawler tmp = {.num_paths = 0,
                      .col_paths = col_paths,
                      .multicol_paths = multicol_paths,
                      .unicol_paths = unicol_paths,
                      .col_list = col_list};

  memcpy(crawler, &tmp, sizeof(GraphCrawler));

  graph_cache_alloc(&crawler->cache, db_graph);
  graph_walker_alloc(&crawler->wlk, db_graph);
  rpt_walker_alloc(&crawler->rptwlk, db_graph->ht.capacity, 22); // 4MB
}

void graph_crawler_dealloc(GraphCrawler *crawler)
{
  ctx_free(crawler->col_paths);
  ctx_free(crawler->multicol_paths);
  ctx_free(crawler->unicol_paths);
  ctx_free(crawler->col_list);
  graph_cache_dealloc(&crawler->cache);
  graph_walker_dealloc(&crawler->wlk);
  rpt_walker_dealloc(&crawler->rptwlk);
  memset(crawler, 0, sizeof(GraphCrawler)); // reset
}
static inline void gcrawler_finish_ref_covg(BreakpointCaller *caller,
                                            uint32_t pathid,
                                            KOccurRunBuffer *koruns,
                                            KOccurRunBuffer *koruns_ended,
                                            KOccurRunBuffer *runs_buf,
                                            PathRefRun *ref_runs)
{
  size_t init_len = runs_buf->len;

  // Copy finished runs into array
  kmer_run_buf_ensure_capacity(runs_buf, runs_buf->len + koruns->len + koruns_ended->len);
  kmer_run_buf_append(runs_buf, koruns_ended->data, koruns_ended->len);

  runs_buf->len += koruns_filter(koruns->data, koruns->len,
                                 runs_buf->data + runs_buf->len,
                                 caller->min_ref_nkmers);

  kmer_run_buf_reset(koruns);
  kmer_run_buf_reset(koruns_ended);

  ctx_assert(pathid < MAX_REFRUNS_PER_ORIENT(caller->db_graph->num_of_cols));

  ref_runs[pathid].first_runid = init_len;
  ref_runs[pathid].num_runs = runs_buf->len - init_len;
}
// Traverse from node0 -> node1
static void traverse_5pflank(BreakpointCaller *caller, GraphCrawler *crawler,
                             dBNode node0, dBNode node1)
{
  const dBGraph *db_graph = crawler->cache.db_graph;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t i, num_next;
  BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, node0.key);

  num_next = db_graph_next_nodes(db_graph, bkmer0, node0.orient,
                                 db_node_edges(db_graph, node0.key, 0),
                                 next_nodes, next_nucs);

  // Find index of previous node
  for(i = 0; i < num_next && !db_nodes_are_equal(next_nodes[i],node1); i++) {}

  ctx_assert(i < num_next && db_nodes_are_equal(next_nodes[i],node1));

  kmer_run_buf_reset(&caller->koruns_5p);
  kmer_run_buf_reset(&caller->koruns_5p_ended);
  kmer_run_buf_reset(&caller->flank5p_run_buf);

  // Go backwards to get 5p flank
  // NULL means loop from 0..(ncols-1)
  graph_crawler_fetch(crawler, node0, next_nodes, i, num_next,
                      NULL, db_graph->num_of_cols,
                      gcrawler_flank5p_stop_at_ref_covg,
                      gcrawler_flank5p_finish_ref_covg,
                      caller);
}
int main(int argc, char **argv)
{
  (void)argc; (void)argv;
  cortex_init();
  cmd_init(argc, argv);

  if(argc != 3) die("usage: ./debug <in.ctp> <out.ctp>");
  const char *out_path = argv[2];

  GPathReader pfile;
  memset(&pfile, 0, sizeof(GPathReader));
  gpath_reader_open(&pfile, argv[1], true);
  status("Got file with %zu colours", pfile.ncolours);

  size_t i, kmer_size = 7, ncols = 3;

  gpath_reader_check(&pfile, kmer_size, ncols);

  gzFile gzout = futil_gzopen_create(out_path, "w");

  dBGraph db_graph;
  db_graph_alloc(&db_graph, kmer_size, ncols, 1, 1024, DBG_ALLOC_EDGES);

  // Create a path store that tracks path counts
  gpath_store_alloc(&db_graph.gpstore, db_graph.num_of_cols, db_graph.ht.capacity,
                    ONE_MEGABYTE, true, false);

  // Create path hash table for fast lookup
  gpath_hash_alloc(&db_graph.gphash, &db_graph.gpstore, ONE_MEGABYTE);

  // Set sample names
  for(i = 0; i < pfile.ncolours; i++) {
    const char *sample_name = gpath_reader_get_sample_name(&pfile, i);
    ctx_assert(sample_name != NULL);
    strbuf_set(&db_graph.ginfo[i].sample_name, sample_name);
  }

  // Load path files, add kmers that are missing
  gpath_reader_load(&pfile, GPATH_ADD_MISSING_KMERS, &db_graph);

  hash_table_print_stats(&db_graph.ht);

  // Write output file
  gpath_save(gzout, out_path, 1, true, NULL, NULL, &pfile.json, 1, &db_graph);
  gzclose(gzout);

  // Checks
  // gpath_checks_all_paths(&db_graph, 2); // use two threads
  gpath_checks_counts(&db_graph);

  // Clean up
  gpath_reader_close(&pfile);
  db_graph_dealloc(&db_graph);

  cortex_destroy();
  return EXIT_SUCCESS;
}
// Always adds new path. If newpath could be a duplicate, use gpathhash
// Note: it is not safe to call _add and _find_add simultaneously, since _add
//       avoids the use of locks.
GPath* gpath_store_add_mt(GPathStore *gpstore, hkey_t hkey, GPathNew newgpath)
{
  ctx_assert(newgpath.seq != NULL);

  GPath *gpath = gpath_set_add_mt(&gpstore->gpset, newgpath);
  _gpstore_add_to_llist_mt(gpstore, hkey, gpath);

  return gpath;
}
// @param intocol value to set all intocols to
void file_filter_flatten(FileFilter *fltr, size_t intocol)
{
  size_t i;
  ctx_assert(fltr->filter.b != NULL);

  for(i = 0; i < file_filter_num(fltr); i++)
    file_filter_intocol(fltr, i) = intocol;

  file_filter_update(fltr);
}
void assemble_contigs_stats_merge(AssembleContigStats *dst,
                                  const AssembleContigStats *src)
{
  ctx_assert(dst->lengths.len == dst->junctns.len);
  ctx_assert(dst->lengths.len == dst->num_contigs);
  ctx_assert(src->lengths.len == src->junctns.len);
  ctx_assert(src->lengths.len == src->num_contigs);

  size_t i;

  size_buf_push(&dst->lengths, src->lengths.b, src->lengths.len);
  size_buf_push(&dst->junctns, src->junctns.b, src->junctns.len);

  dst->num_contigs += src->num_contigs;
  dst->total_len   += src->total_len;
  dst->total_junc  += src->total_junc;

  for(i = 0; i < 5; i++)
    dst->contigs_outdegree[i] += src->contigs_outdegree[i];

  for(i = 0; i < AC_MAX_PATHS; i++) {
    dst->paths_held[i] += src->paths_held[i];
    dst->paths_cntr[i] += src->paths_cntr[i];
  }

  dst->paths_held_max = MAX2(dst->paths_held_max, src->paths_held_max);
  dst->paths_cntr_max = MAX2(dst->paths_cntr_max, src->paths_cntr_max);

  for(i = 0; i < GRPHWLK_NUM_STATES; i++)
    dst->grphwlk_steps[i] += src->grphwlk_steps[i];

  for(i = 0; i < ASSEM_NUM_STOPS; i++)
    dst->stop_causes[i] += src->stop_causes[i];

  dst->max_junc_density = MAX2(dst->max_junc_density, src->max_junc_density);
  dst->num_contigs_from_seed_kmers += src->num_contigs_from_seed_kmers;
  dst->num_contigs_from_seed_paths += src->num_contigs_from_seed_paths;
  dst->num_reseed_abort += src->num_reseed_abort;
  dst->num_seeds_not_found += src->num_seeds_not_found;
}
void cleaning_write_len_histogram(const char *path,
                                  const uint64_t *hist, size_t len,
                                  size_t kmer_size)
{
  ctx_assert(len >= 2);
  ctx_assert(hist[0] == 0);
  size_t i, end;

  FILE *fout = _open_histogram_file(path, "unitig length");
  if(fout == NULL) return;

  fprintf(fout, "UnitigKmerLength,bp,Count\n");

  for(end = len-1; end > 1 && hist[end] == 0; end--) {}

  fprintf(fout, "1,%zu,%"PRIu64"\n", kmer_size, hist[1]);

  for(i = 2; i <= end; i++) {
    if(hist[i] > 0)
      fprintf(fout, "%zu,%zu,%"PRIu64"\n", i, kmer_size+i-1, hist[i]);
  }

  fclose(fout);
}
// Get coverages from nodes in nbuf, store in cbuf
static inline void fetch_coverages(dBNodeBuffer nbuf, CovgBuffer *cbuf,
                                   const dBGraph *db_graph)
{
  ctx_assert(db_graph->num_of_cols == 1);
  size_t i;

  covg_buf_reset(cbuf);
  covg_buf_capacity(cbuf, nbuf.len);
  cbuf->len = nbuf.len;

  for(i = 0; i < nbuf.len; i++)
    cbuf->b[i] = db_graph->col_covgs[nbuf.b[i].key];
}
// Return 1 if changed; 0 otherwise
bool infer_pop_edges(const BinaryKmer node_bkey, Edges *edges,
                     const Covg *covgs, const dBGraph *db_graph)
{
  Edges uedges = 0, iedges = 0xf, add_edges, edge;
  size_t orient, nuc, col, kmer_size = db_graph->kmer_size;
  const size_t ncols = db_graph->num_of_cols;
  BinaryKmer bkey, bkmer;
  hkey_t next;
  Edges newedges[ncols];

  // char tmp[MAX_KMER_SIZE+1];
  // binary_kmer_to_str(node_bkey, db_graph->kmer_size, tmp);
  // status("Inferring %s", tmp);

  for(col = 0; col < ncols; col++) {
    uedges |= edges[col]; // union of edges
    iedges &= edges[col]; // intersection of edges
    newedges[col] = edges[col];
  }

  add_edges = uedges & ~iedges;

  if(!add_edges) return 0;

  for(orient = 0; orient < 2; orient++)
  {
    bkmer = (orient == FORWARD ? binary_kmer_left_shift_one_base(node_bkey, kmer_size)
                               : binary_kmer_right_shift_one_base(node_bkey));

    for(nuc = 0; nuc < 4; nuc++)
    {
      edge = nuc_orient_to_edge(nuc, orient);

      if(add_edges & edge)
      {
        // get next bkmer, look up in graph
        if(orient == FORWARD) binary_kmer_set_last_nuc(&bkmer, nuc);
        else binary_kmer_set_first_nuc(&bkmer, dna_nuc_complement(nuc), kmer_size);

        bkey = bkmer_get_key(bkmer, kmer_size);
        next = hash_table_find(&db_graph->ht, bkey);
        ctx_assert(next != HASH_NOT_FOUND);

        for(col = 0; col < ncols; col++)
          if(covgs[col] > 0 && db_node_has_col(db_graph, next, col))
            newedges[col] |= edge;
      }
    }
  }

  int cmp = memcmp(edges, newedges, sizeof(Edges)*ncols);
  memcpy(edges, newedges, sizeof(Edges)*ncols);

  return (cmp != 0);
}
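// Worked example of the bit arithmetic above: suppose two colours have edge
// bitmasks 0b0011 and 0b0001. Then uedges = 0b0011, iedges = 0b0001 and
// add_edges = uedges & ~iedges = 0b0010, i.e. only the edge present in some
// but not all colours is looked up in the hash table; it is then added to
// every colour that has non-zero coverage and contains the neighbouring kmer.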
void cleaning_write_covg_histogram(const char *path,
                                   const uint64_t *covg_hist,
                                   const uint64_t *mean_covg_hist,
                                   size_t len)
{
  ctx_assert(len >= 2);
  ctx_assert(covg_hist[0] == 0);
  ctx_assert(mean_covg_hist[0] == 0);
  size_t i, end;

  FILE *fout = _open_histogram_file(path, "unitig coverage");
  if(fout == NULL) return;

  fprintf(fout, "Covg,NumKmers,NumUnitigs\n");

  for(end = len-1; end > 2 && covg_hist[end] == 0; end--) {}

  for(i = 1; i <= end; i++) {
    if(covg_hist[i] > 0)
      fprintf(fout, "%zu,%"PRIu64",%"PRIu64"\n", i, covg_hist[i], mean_covg_hist[i]);
  }

  fclose(fout);
}
static void process_contig(BreakpointCaller *caller,
                           const uint32_t *cols, size_t ncols,
                           const dBNodeBuffer *flank5p,
                           const dBNodeBuffer *allelebuf,
                           const KOccurRun *flank5p_runs, size_t num_flank5p_runs,
                           const KOccurRun *flank3p_runs, size_t num_flank3p_runs)
{
  gzFile gzout = caller->gzout;
  KOGraph kograph = caller->kograph;
  const size_t kmer_size = caller->db_graph->kmer_size;

  ctx_assert(ncols > 0);

  // we never re-met the ref
  if(num_flank3p_runs == 0) return;

  // Find first place we meet the ref
  size_t callid = __sync_fetch_and_add((volatile size_t*)caller->callid, 1);

  // Swallow up some of the path into the 3p flank
  size_t i, flank3pidx = flank3p_runs[0].qoffset;
  size_t extra3pbases = MIN2(kmer_size-1, flank3pidx);
  size_t num_path_kmers = flank3pidx - extra3pbases;
  size_t kmer3poffset = kmer_size-1-extra3pbases;

  pthread_mutex_lock(caller->out_lock);

  // 5p flank with list of ref intersections
  gzprintf(gzout, ">brkpnt.%zu.5pflank chr=", callid);
  koruns_gzprint(gzout, kmer_size, kograph, flank5p_runs, num_flank5p_runs, 0, 0);
  gzputc(gzout, '\n');
  db_nodes_gzprint(flank5p->data, flank5p->len, caller->db_graph, gzout);
  gzputc(gzout, '\n');

  // 3p flank with list of ref intersections
  gzprintf(gzout, ">brkpnt.%zu.3pflank chr=", callid);
  koruns_gzprint(gzout, kmer_size, kograph, flank3p_runs, num_flank3p_runs,
                 flank3pidx, kmer3poffset);
  gzputc(gzout, '\n');
  db_nodes_gzprint_cont(allelebuf->data+num_path_kmers,
                        allelebuf->len-num_path_kmers,
                        caller->db_graph, gzout);
  gzputc(gzout, '\n');

  // Print path with list of colours
  gzprintf(gzout, ">brkpnt.%zu.path cols=%zu", callid, (size_t)cols[0]);
  for(i = 1; i < ncols; i++) gzprintf(gzout, ",%zu", (size_t)cols[i]);
  gzputc(gzout, '\n');
  db_nodes_gzprint_cont(allelebuf->data, num_path_kmers, caller->db_graph, gzout);
  gzprintf(gzout, "\n\n");

  pthread_mutex_unlock(caller->out_lock);
}
/**
 * @param cpy_flnk_5p how many characters to copy from the end of the 5' flank
 *                    onto the start of the allele
 * @param cpy_flnk_3p how many characters to copy from the start of the 3' flank
 *                    onto the end of the allele
 */
static void align_entry_allele(const char *line, size_t linelen,
                               const char *flank5p, size_t flank5p_len,
                               const char *flank3p, size_t flank3p_len,
                               size_t cpy_flnk_5p, size_t cpy_flnk_3p,
                               const read_t *chr,
                               size_t ref_start, size_t ref_end,
                               bool fw_strand,
                               const char *info, const char **genotypes,
                               StrBuf *tmpbuf, FILE *fout)
{
  (void)flank3p_len;
  ctx_assert(ref_start <= ref_end);

  // Ref allele
  const char *ref_allele = chr->seq.b + ref_start;
  size_t ref_len = ref_end - ref_start;

  // Construct alt allele
  const char *alt_allele;
  size_t alt_len;

  if(cpy_flnk_5p + cpy_flnk_3p == 0 && fw_strand) {
    alt_allele = line;
    alt_len = linelen;
  }
  else {
    strbuf_reset(tmpbuf);
    strbuf_append_strn(tmpbuf, flank5p+flank5p_len-cpy_flnk_5p, cpy_flnk_5p);
    strbuf_append_strn(tmpbuf, line, linelen);
    strbuf_append_strn(tmpbuf, flank3p, cpy_flnk_3p);

    if(!fw_strand)
      dna_revcomp_str(tmpbuf->b, tmpbuf->b, tmpbuf->end);

    alt_allele = tmpbuf->b;
    alt_len = tmpbuf->end;
  }

  // printf("%.*s vs %.*s\n", (int)(ref_end-ref_start), chr->seq.b + ref_start,
  //        (int)alt_len, seq);

  // Align chrom and seq
  needleman_wunsch_align2(ref_allele, alt_allele, ref_len, alt_len,
                          &nw_scoring_allele, nw_aligner, aln);
  num_nw_allele++;

  // Break into variants and print VCF
  align_biallelic(aln->result_a, aln->result_b, chr, ref_start,
                  info, genotypes, fout);
}
// Return sum of bases on right of alignment with:
// * hard masked (H)
// * soft masked (S)
// * inserted bases relative to ref (I)
static inline uint32_t bam_get_end_padding(int n_cigar, const uint32_t *cigar)
{
  ctx_assert(n_cigar > 0);
  uint32_t l = 0;
  int i;
  const uint32_t c = (1<<BAM_CINS)|(1<<BAM_CSOFT_CLIP)|(1<<BAM_CHARD_CLIP);

  // Walk the CIGAR from the right, stopping at the first op that is not
  // an insertion or a clip (i.e. the first op that consumes the reference)
  for(i = n_cigar-1; i >= 0 && ((c >> bam_cigar_op(cigar[i])) & 1); i--)
    l += bam_cigar_oplen(cigar[i]);

  return l;
}
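// For context, a minimal sketch of calling the helper above on an htslib
// record (report_end_padding() is a hypothetical caller; bam_get_cigar(),
// bam_get_qname() and core.n_cigar are standard htslib accessors; error
// handling omitted):

#include <stdio.h>
#include <htslib/sam.h>

static void report_end_padding(const bam1_t *b)
{
  uint32_t pad = bam_get_end_padding(b->core.n_cigar, bam_get_cigar(b));
  printf("%s: %u query bases of right-end padding\n", bam_get_qname(b), pad);
}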