// Traverse from node0 -> node1
static void traverse_5pflank(BreakpointCaller *caller, GraphCrawler *crawler,
                             dBNode node0, dBNode node1)
{
  const dBGraph *db_graph = crawler->cache.db_graph;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t i, num_next;
  BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, node0.key);

  num_next = db_graph_next_nodes(db_graph, bkmer0, node0.orient,
                                 db_node_edges(db_graph, node0.key, 0),
                                 next_nodes, next_nucs);

  // Find index of previous node
  for(i = 0; i < num_next && !db_nodes_are_equal(next_nodes[i],node1); i++) {}

  ctx_assert(i < num_next && db_nodes_are_equal(next_nodes[i],node1));

  kmer_run_buf_reset(&caller->koruns_5p);
  kmer_run_buf_reset(&caller->koruns_5p_ended);
  kmer_run_buf_reset(&caller->flank5p_run_buf);

  // Go backwards to get 5p flank
  // NULL means loop from 0..(ncols-1)
  graph_crawler_fetch(crawler, node0,
                      next_nodes, next_nucs, i, num_next,
                      NULL, db_graph->num_of_cols,
                      gcrawler_flank5p_stop_at_ref_covg,
                      gcrawler_flank5p_finish_ref_covg,
                      caller);
}
Example #2
// Orient supernode
// Once oriented, the supernode has the lowest possible kmer key at the
// beginning, oriented FORWARD if possible
void supernode_normalise(dBNode *nlist, size_t len, const dBGraph *db_graph)
{
  // Sort supernode into forward orientation
  ctx_assert(len > 0);

  if(len == 1) {
    nlist[0].orient = FORWARD;
    return;
  }

  BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, nlist[0].key);
  BinaryKmer bkmer1 = db_node_get_bkmer(db_graph, nlist[len-1].key);

  // Check if closed cycle
  if(supernode_is_closed_cycle(nlist, len, bkmer0, bkmer1, db_graph))
  {
    // find lowest kmer to start from
    BinaryKmer lowest = bkmer0, tmp;
    size_t i, idx = 0;
    for(i = 1; i < len; i++) {
      tmp = db_node_get_bkmer(db_graph, nlist[i].key);
      if(binary_kmer_less_than(tmp, lowest)) {
        lowest = tmp;
        idx = i;
      }
    }

    // Re-orient unless we already start at the lowest kmer in FORWARD orientation
    if(idx > 0 || nlist[0].orient != FORWARD)
    {
      // a->b->c->d->e->f->a
      // if c is lowest and FORWARD:  c->d->e->f->a->b (keep orientations)
      // if c is lowest and REVERSE:  c->b->a->f->e->d (reverse orientations)

      if(nlist[idx].orient == FORWARD) {
        // Shift left by idx, without affecting orientations
        db_nodes_left_shift(nlist, len, idx);
      } else {
        db_nodes_reverse_complement(nlist, idx+1);
        db_nodes_reverse_complement(nlist+idx+1, len-idx-1);
      }
    }
  }
  else if(binary_kmer_less_than(bkmer1,bkmer0)) {
    db_nodes_reverse_complement(nlist, len);
  }
}
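
// Usage sketch (not from the original source): normalise a supernode that has
// already been collected into a dBNodeBuffer, then print it with
// db_nodes_print_verbose() from a later example. Buffer field names (.b/.len)
// follow the test code further down; this is illustrative only.
static inline void example_normalise_and_print(dBNodeBuffer *nbuf,
                                               const dBGraph *db_graph)
{
  ctx_assert(nbuf->len > 0);
  supernode_normalise(nbuf->b, nbuf->len, db_graph);
  db_nodes_print_verbose(nbuf->b, nbuf->len, db_graph, stdout);
}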
Example #3
// Get bkey:orient string representation e.g. "AGAGTTTTATC:1".
//   :0 means forward, :1 means reverse
//   `str` must be at least kmer_size+3 chars long
// Returns length in bytes. Null terminates `str`.
size_t db_node_to_str(const dBGraph *db_graph, dBNode node, char *str)
{
  const size_t kmer_size = db_graph->kmer_size;
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key);
  binary_kmer_to_str(bkmer, kmer_size, str);
  str[kmer_size] = ':';
  str[kmer_size+1] = '0' + node.orient;
  str[kmer_size+2] = '\0';
  return kmer_size + 2;
}
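
// Usage sketch (not from the original source): format a node and log it with
// status(), the logging call seen in the commented-out debug code further
// down. The buffer needs kmer_size+3 bytes; MAX_KMER_SIZE+3 always suffices.
static inline void example_log_node(const dBGraph *db_graph, dBNode node)
{
  char tmpstr[MAX_KMER_SIZE+3];
  db_node_to_str(db_graph, node, tmpstr);
  status("node %s", tmpstr); // e.g. "AGAGTTTTATC:1"
}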
Example #4
// Print:
// 0: AAACCCAAATGCAAACCCAAATGCAAACCCA:1 TGGGTTTGCATTTGGGTTTGCATTTGGGTTT
// 1: CAAACCCAAATGCAAACCCAAATGCAAACCC:1 GGGTTTGCATTTGGGTTTGCATTTGGGTTTG
// ...
void db_nodes_print_verbose(const dBNode *nodes, size_t num,
                            const dBGraph *db_graph, FILE *out)
{
  const size_t kmer_size = db_graph->kmer_size;
  size_t i;
  BinaryKmer bkey, oriented;
  char keystr[MAX_KMER_SIZE+1], orientedstr[MAX_KMER_SIZE+1];

  // Each line: index, kmer key with orientation, then the oriented kmer
  for(i = 0; i < num; i++) {
    bkey = db_node_get_bkmer(db_graph, nodes[i].key);
    oriented = db_node_oriented_bkmer(db_graph, nodes[i]);
    binary_kmer_to_str(bkey, kmer_size, keystr);
    binary_kmer_to_str(oriented, kmer_size, orientedstr);
    fprintf(out, "%3zu: %s:%i %s\n", i, keystr, (int)nodes[i].orient, orientedstr);
  }
}
Example #5
// For every kmer in the graph, we run this function
static inline bool print_edges(hkey_t hkey, size_t threadid, void *arg)
{
  (void)threadid;
  UnitigPrinter *p = (UnitigPrinter*)arg;
  UnitigEnd uend = p->ugraph.unitig_ends[hkey];

  // Check if node is an end of a unitig
  if(uend.assigned) {
    BinaryKmer bkey = db_node_get_bkmer(p->db_graph, hkey);
    Edges edges = db_node_get_edges(p->db_graph, hkey, 0);

    if(uend.left) {
      _print_edge(hkey, false, bkey, edges, uend, p);
    }
    if(uend.right) {
      _print_edge(hkey, true, bkey, edges, uend, p);
    }
  }

  return false; // keep iterating
}
Example #6
// Returns number of bytes added
size_t db_nodes_to_str(const dBNode *nodes, size_t num,
                       const dBGraph *db_graph, char *str)
{
  if(num == 0) return 0;

  size_t i;
  size_t kmer_size = db_graph->kmer_size;
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, nodes[0].key);
  Nucleotide nuc;

  binary_kmer_to_str(bkmer, kmer_size, str);
  if(nodes[0].orient == REVERSE) dna_reverse_complement_str(str, kmer_size);

  for(i = 1; i < num; i++) {
    nuc = db_node_get_last_nuc(nodes[i], db_graph);
    str[kmer_size+i-1] = dna_nuc_to_char(nuc);
  }

  str[kmer_size+num-1] = '\0';
  return kmer_size+num-1;
}
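
// Usage sketch (not from the original source): reconstruct the sequence of a
// node path. The output buffer needs kmer_size+num bytes (sequence plus the
// '\0'), which is the capacity the crawler test below reserves with
// strbuf_ensure_capacity(). Illustrative only.
static inline void example_log_path_seq(const dBNodeBuffer *nbuf,
                                        const dBGraph *db_graph)
{
  StrBuf sbuf;
  strbuf_alloc(&sbuf, 128);
  strbuf_ensure_capacity(&sbuf, nbuf->len + db_graph->kmer_size);
  sbuf.end = db_nodes_to_str(nbuf->b, nbuf->len, db_graph, sbuf.b);
  status("path: %s", sbuf.b);
  strbuf_dealloc(&sbuf);
}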
Example #7
// Edges restricted to this colour, only in one direction (node.orient)
Edges db_node_edges_in_col(dBNode node, size_t col, const dBGraph *db_graph)
{
  if(db_graph->node_in_cols == NULL && db_graph->col_covgs == NULL) {
    Edges edges = db_node_get_edges(db_graph, node.key, col);
    return edges_mask_orientation(edges, node.orient);
  }

  // Edges are merged into one colour
  ctx_assert(db_graph->num_edge_cols == 1);
  ctx_assert(db_graph->node_in_cols != NULL || db_graph->col_covgs != NULL);

  Edges edges = db_node_get_edges(db_graph, node.key, 0);

  // Check which next nodes are in the given colour
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key);
  dBNode nodes[4];
  Nucleotide nucs[4];
  size_t i, n;

  n = db_graph_next_nodes(db_graph, bkmer, node.orient,
                          edges, nodes, nucs);

  edges = 0;
  if(db_graph->node_in_cols != NULL) {
    for(i = 0; i < n; i++)
      if(db_node_has_col(db_graph, nodes[i].key, col))
        edges = edges_set_edge(edges, nucs[i], node.orient);
  }
  else if(db_graph->col_covgs != NULL) {
    for(i = 0; i < n; i++)
      if(db_node_col_covg(db_graph, nodes[i].key, col) > 0)
        edges = edges_set_edge(edges, nucs[i], node.orient);
  }
  else ctx_assert(0);

  return edges;
}
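
// Usage sketch (not from the original source): restrict traversal from `node`
// to the neighbours present in colour `col` by feeding the per-colour edge
// mask back into db_graph_next_nodes(). Illustrative only.
static inline size_t example_next_nodes_in_col(dBNode node, size_t col,
                                               const dBGraph *db_graph,
                                               dBNode next_nodes[4],
                                               Nucleotide next_nucs[4])
{
  Edges col_edges = db_node_edges_in_col(node, col, db_graph);
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, node.key);
  return db_graph_next_nodes(db_graph, bkmer, node.orient, col_edges,
                             next_nodes, next_nucs);
}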
Example #8
static inline int infer_edges_node(hkey_t hkey,
                                   bool add_all_edges,
                                   Covg *tmp_covgs,
                                   const dBGraph *db_graph,
                                   size_t *num_nodes_modified)
{
  BinaryKmer bkmer = db_node_get_bkmer(db_graph, hkey);
  Edges *edges = &db_node_edges(db_graph, hkey, 0);
  size_t col;

  // Create coverages of zero or one depending on whether the node has each colour
  if(db_graph->col_covgs == NULL) {
    for(col = 0; col < db_graph->num_of_cols; col++)
      tmp_covgs[col] = db_node_has_col(db_graph, hkey, col);
  } else {
    tmp_covgs = &db_node_covg(db_graph, hkey, 0);
  }

  (*num_nodes_modified)
    += (add_all_edges ? infer_all_edges(bkmer, edges, tmp_covgs, db_graph)
                      : infer_pop_edges(bkmer, edges, tmp_covgs, db_graph));

  return 0; // => keep iterating
}
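
// Usage sketch (not from the original source): run edge inference for a single
// kmer. In the real code this callback is driven by an iterator over every
// kmer in the hash table; here `hkey` is assumed to be a valid occupied entry.
static inline size_t example_infer_edges_one(hkey_t hkey, bool add_all_edges,
                                             const dBGraph *db_graph)
{
  size_t num_modified = 0;
  Covg tmp_covgs[db_graph->num_of_cols]; // per-colour scratch (C99 VLA)
  infer_edges_node(hkey, add_all_edges, tmp_covgs, db_graph, &num_modified);
  return num_modified;
}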
Example #9
void test_graph_crawler()
{
  test_status("Testing graph crawler...");

  // Construct a 3 colour graph with kmer-size=11
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;

  db_graph_alloc(&graph, kmer_size, ncols, 1, 2048,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  char graphseq[3][77] =
//           <               X                 X              X...............
{"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC",
 "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA",
 "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"};

  build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0]));
  build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1]));
  build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2]));

  // Crawl graph
  GraphCrawler crawler;
  graph_crawler_alloc(&crawler, &graph);

  dBNode node = db_graph_find_str(&graph, graphseq[0]);
  dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1);
  TASSERT(node.key != HASH_NOT_FOUND);
  TASSERT(next_node.key != HASH_NOT_FOUND);

  BinaryKmer bkey = db_node_get_bkmer(&graph, node.key);
  Edges edges = db_node_get_edges(&graph, node.key, 0);

  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t i, p, num_next, next_idx;

  num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges,
                                 next_nodes, next_nucs);

  next_idx = 0;
  while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node))
    next_idx++;

  TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node));

  // Crawl in all colours
  graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next,
                      NULL, graph.num_of_cols, NULL, NULL, NULL);

  TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths);

  // Fetch paths
  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 16);
  StrBuf sbuf;
  strbuf_alloc(&sbuf, 128);

  for(p = 0; p < crawler.num_paths; p++) {
    db_node_buf_reset(&nbuf);
    graph_crawler_get_path_nodes(&crawler, p, &nbuf);
    strbuf_ensure_capacity(&sbuf, nbuf.len+graph.kmer_size);
    sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b);
    for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {}
    TASSERT2(i < 3, "seq: %s", sbuf.b);
    TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end);
    TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len);
  }

  strbuf_dealloc(&sbuf);
  db_node_buf_dealloc(&nbuf);

  graph_crawler_dealloc(&crawler);

  db_graph_dealloc(&graph);
}
Example #10
// Walk the graph remembering the last time we met the ref
// When traversal fails, dump sequence up to last meeting with the ref
static void follow_break(BreakpointCaller *caller, dBNode node)
{
  size_t i, j, k, num_next;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t nonref_idx[4], num_nonref_next = 0;
  const dBGraph *db_graph = caller->db_graph;

  BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key);
  Edges edges = db_node_get_edges(db_graph, node.key, 0);

  num_next = db_graph_next_nodes(db_graph, bkey, node.orient, edges,
                                 next_nodes, next_nucs);

  // Filter out next nodes in the reference
  for(i = 0; i < num_next; i++) {
    if(kograph_num(caller->kograph, next_nodes[i].key) == 0) {
      nonref_idx[num_nonref_next] = i;
      num_nonref_next++;
    }
  }

  // Abandon if all options are in ref or none are
  if(num_nonref_next == num_next || num_nonref_next == 0) return;

  // Follow all paths not in ref, in all colours
  GraphCrawler *fw_crawler = &caller->crawlers[node.orient];
  GraphCrawler *rv_crawler = &caller->crawlers[!node.orient];
  dBNodeBuffer *allelebuf = &caller->allelebuf, *flank5pbuf = &caller->flank5pbuf;
  GCMultiColPath *flank5p_multicolpath, *allele_multicolpath;
  KOccurRun *flank5p_runs, *flank3p_runs;
  size_t flank5p_pathid, allele_pathid;
  size_t num_flank5p_runs, num_flank3p_runs;

  // We fetch 5' flanks in all colours, then merge matching paths.
  // We stop fetching a single path once it stops tracking the reference.
  // Alternatively, we could fetch the 5' flank in every sample and stop after
  // a given distance, then check how well that set of paths tracks the
  // reference. That scales much better with the number of samples, but not so
  // well as min_ref_nkmers increases (since we fetch many flanks that can't
  // be used) - this seems the lesser worry.

  // Loop over possible next nodes at this junction
  for(i = 0; i < num_nonref_next; i++)
  {
    size_t next_idx = nonref_idx[i];

    // Go backwards to get 5p flank
    traverse_5pflank(caller, rv_crawler, db_node_reverse(next_nodes[next_idx]),
                     db_node_reverse(node));

    // Loop over the flanks we got
    for(j = 0; j < rv_crawler->num_paths; j++)
    {
      // Get 5p flank
      db_node_buf_reset(flank5pbuf);
      graph_crawler_get_path_nodes(rv_crawler, j, flank5pbuf);
      flank5p_multicolpath = &rv_crawler->multicol_paths[j];
      flank5p_pathid = flank5p_multicolpath->pathid;

      // Fetch 3pflank ref position
      num_flank5p_runs = caller->flank5p_refs[flank5p_pathid].num_runs;
      flank5p_runs = fetch_ref_contact(&rv_crawler->cache, flank5p_pathid,
                                       caller->flank5p_refs,
                                       &caller->flank5p_run_buf);

      koruns_reverse(flank5p_runs, num_flank5p_runs, flank5pbuf->len);
      koruns_sort_by_qoffset(flank5p_runs, num_flank5p_runs);
      db_nodes_reverse_complement(flank5pbuf->data, flank5pbuf->len);

      if(num_flank5p_runs > 0)
      {
        // Reset caller
        kmer_run_buf_reset(&caller->koruns_3p);
        kmer_run_buf_reset(&caller->koruns_3p_ended);
        kmer_run_buf_reset(&caller->allele_run_buf);

        // functions gcrawler_path_stop_at_ref_covg(),
        //           gcrawler_path_finish_ref_covg()
        // both fill koruns_3p, koruns_3p_ended and allele_run_buf

        // Only traverse in the colours we have a flank for
        graph_crawler_fetch(fw_crawler, node,
                            next_nodes, next_nucs, next_idx, num_next,
                            flank5p_multicolpath->cols,
                            flank5p_multicolpath->num_cols,
                            gcrawler_path_stop_at_ref_covg,
                            gcrawler_path_finish_ref_covg,
                            caller);

        // Assemble contigs - fetch forwards for each path for given 5p flank
        for(k = 0; k < fw_crawler->num_paths; k++)
        {
          // Fetch nodes
          db_node_buf_reset(allelebuf);
          graph_crawler_get_path_nodes(fw_crawler, k, allelebuf);
          ctx_assert(allelebuf->len > 0);

          allele_multicolpath = &fw_crawler->multicol_paths[k];
          allele_pathid = allele_multicolpath->pathid;

          // Fetch 3pflank ref position
          num_flank3p_runs = caller->allele_refs[allele_pathid].num_runs;
          flank3p_runs = fetch_ref_contact(&fw_crawler->cache, allele_pathid,
                                           caller->allele_refs,
                                           &caller->allele_run_buf);

          process_contig(caller,
                         allele_multicolpath->cols,
                         allele_multicolpath->num_cols,
                         flank5pbuf, allelebuf,
                         flank5p_runs, num_flank5p_runs,
                         flank3p_runs, num_flank3p_runs);
        }
      }
    }
  }
}
Example #11
// `fork_node` is a node with outdegree > 1
void find_bubbles(BubbleCaller *caller, dBNode fork_node)
{
  graph_cache_reset(&caller->cache);

  const dBGraph *db_graph = caller->db_graph;
  GraphCache *cache = &caller->cache;
  GraphWalker *wlk = &caller->wlk;
  RepeatWalker *rptwlk = &caller->rptwlk;

  // char tmpstr[MAX_KMER_SIZE+3];
  // db_node_to_str(db_graph, fork_node, tmpstr);
  // status("Calling from %s", tmpstr);

  dBNode nodes[4];
  Nucleotide bases[4];
  size_t i, num_next, num_edges_in_col;
  BinaryKmer fork_bkmer = db_node_get_bkmer(db_graph, fork_node.key);

  num_next = db_graph_next_nodes(db_graph, fork_bkmer, fork_node.orient,
                                 db_node_edges(db_graph, fork_node.key, 0),
                                 nodes, bases);

  // loop over alleles, then colours
  Colour colour, colours_loaded = db_graph->num_of_cols;
  bool node_has_col[4];

  uint32_t pathid;

  for(colour = 0; colour < colours_loaded; colour++)
  {
    if(!db_node_has_col(db_graph, fork_node.key, colour)) continue;

    // Determine if this fork is a fork in the current colour
    num_edges_in_col = 0;
    for(i = 0; i < num_next; i++) {
      node_has_col[i] = (db_node_has_col(db_graph, nodes[i].key, colour) > 0);
      num_edges_in_col += node_has_col[i];
    }

    graph_walker_setup(wlk, true, colour, colour, db_graph);

    for(i = 0; i < num_next; i++)
    {
      if(node_has_col[i])
      {
        graph_walker_start(wlk, fork_node);
        graph_walker_force(wlk, nodes[i], num_edges_in_col > 1);

        pathid = graph_crawler_load_path_limit(cache, nodes[i], wlk, rptwlk,
                                               caller->prefs.max_allele_len);

        graph_walker_finish(wlk);
        graph_crawler_reset_rpt_walker(rptwlk, cache, pathid);
      }
    }
  }

  // Set up 5p flank
  caller->flank5p.b[0] = db_node_reverse(fork_node);
  caller->flank5p.len = 0; // len of 0 signifies we haven't fetched the flank yet
}