Example #1
// Orient supernode
// Once oriented, the supernode has the lowest possible kmer key at the
// beginning, oriented FORWARD if possible
void supernode_normalise(dBNode *nlist, size_t len, const dBGraph *db_graph)
{
  // Sort supernode into forward orientation
  ctx_assert(len > 0);

  if(len == 1) {
    nlist[0].orient = FORWARD;
    return;
  }

  BinaryKmer bkmer0 = db_node_get_bkmer(db_graph, nlist[0].key);
  BinaryKmer bkmer1 = db_node_get_bkmer(db_graph, nlist[len-1].key);

  // Check if closed cycle
  if(supernode_is_closed_cycle(nlist, len, bkmer0, bkmer1, db_graph))
  {
    // find lowest kmer to start from
    BinaryKmer lowest = bkmer0, tmp;
    size_t i, idx = 0;
    for(i = 1; i < len; i++) {
      tmp = db_node_get_bkmer(db_graph, nlist[i].key);
      if(binary_kmer_less_than(tmp, lowest)) {
        lowest = tmp;
        idx = i;
      }
    }

    // No change needed if we already start from the lowest kmer in the
    // FORWARD orientation
    if(idx > 0 || nlist[0].orient != FORWARD)
    {
      // a->b->c->d->e->f->a
      // if c is lowest and FORWARD:  c->d->e->f->a->b (keep orientations)
      // if c is lowest and REVERSE:  c->b->a->f->e->d (reverse orientations)

      if(nlist[idx].orient == FORWARD) {
        // Shift left by idx, without affecting orientations
        db_nodes_left_shift(nlist, len, idx);
      } else {
        db_nodes_reverse_complement(nlist, idx+1);
        db_nodes_reverse_complement(nlist+idx+1, len-idx-1);
      }
    }
  }
  else if(binary_kmer_less_than(bkmer1,bkmer0)) {
    db_nodes_reverse_complement(nlist, len);
  }
}
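
The closed-cycle branch above boils down to: rotate left if the lowest kmer is already FORWARD, otherwise reverse the two halves around it. Below is a minimal standalone sketch of that rule, using plain characters in place of kmers; none of it comes from the cortex API, it is purely illustrative.

#include <stdio.h>
#include <string.h>

// Reverse a range of characters in place
static void reverse_range(char *s, size_t len)
{
  if(len < 2) return;
  for(size_t i = 0, j = len-1; i < j; i++, j--) {
    char tmp = s[i]; s[i] = s[j]; s[j] = tmp;
  }
}

// Rotate left via three reversals: reverse both parts, then the whole string
static void rotate_left(char *s, size_t len, size_t shift)
{
  reverse_range(s, shift);
  reverse_range(s+shift, len-shift);
  reverse_range(s, len);
}

int main(void)
{
  char fw[] = "abcdef", rv[] = "abcdef";
  size_t idx = 2; // pretend 'c' carries the lowest kmer key

  rotate_left(fw, strlen(fw), idx);           // FORWARD case: keep walk order
  reverse_range(rv, idx+1);                   // REVERSE case: the same two
  reverse_range(rv+idx+1, strlen(rv)-idx-1);  // reversals used in the code above

  printf("%s\n%s\n", fw, rv); // prints: cdefab / cbafed
  return 0;
}
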
Example #2
static void test_kmer_occur_filter()
{
  // Construct a 3-colour graph with kmer-size=11
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;
  size_t i;

  // Create graph
  db_graph_alloc(&graph, kmer_size, ncols, 1, 2000,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  //      xyz------->>>      y         >  <         X
  // TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA

  #define NUM_NODES 3
  #define NUM_READS 3

  const char *tmp[NUM_READS]
  = {
    "AACA",
    "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA",
    "TCTAGCATGTGTGTT"};

  read_t reads[NUM_READS];
  for(i = 0; i < NUM_READS; i++) {
    seq_read_alloc(&reads[i]);
    seq_read_set(&reads[i], tmp[i]);
  }

  KOGraph kograph = kograph_create(reads, NUM_READS, true, 0, 1, &graph);

  TASSERT(kograph.nchroms == NUM_READS);
  TASSERT(kograph.koccurs != NULL);

  KOccurRunBuffer koruns, koruns_tmp, koruns_ended;
  korun_buf_alloc(&koruns, 16);
  korun_buf_alloc(&koruns_tmp, 16);
  korun_buf_alloc(&koruns_ended, 16);

  // Check CCCGACAGGGCAA starts at CCCGACAGGGC
  // x=CCCGACAGGGC, y=CCGACAGGGCA, z=CGACAGGGCAA
  // X=GCCCTGTCGGG, Y=TGCCCTGTCGG, Z=TTGCCCTGTCG
  dBNode nodes[NUM_NODES];
  for(i = 0; i < NUM_NODES; i++)
    nodes[i] = db_graph_find_str(&graph, &"CCCGACAGGGCAA"[i]);

  korun_buf_reset(&koruns);
  korun_buf_reset(&koruns_ended);
  kograph_filter_extend(&kograph, nodes, NUM_NODES, true, 0, 0,
                        &koruns, &koruns_tmp, &koruns_ended);

  // Checks
  TASSERT2(koruns.len == 1, "koruns.len: %zu", koruns.len);
  TASSERT(koruns.b[0].strand == STRAND_PLUS); // left-to-right with ref
  TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom);
  TASSERT2(koruns.b[0].first == 5, "offset: %zu", (size_t)koruns.b[0].first);
  TASSERT2(koruns.b[0].last == 7, "last: %zu", (size_t)koruns.b[0].last);

  // Test reverse
  db_nodes_reverse_complement(nodes, NUM_NODES);

  korun_buf_reset(&koruns);
  korun_buf_reset(&koruns_ended);
  kograph_filter_extend(&kograph, nodes, 1, true, 0, 0, &koruns, &koruns_tmp, &koruns_ended);
  kograph_filter_extend(&kograph, nodes+1, 1, true, 0, 1, &koruns, &koruns_tmp, &koruns_ended);
  kograph_filter_extend(&kograph, nodes+2, 1, true, 0, 2, &koruns, &koruns_tmp, &koruns_ended);

  // Print out for debugging
  // printf("koruns: ");
  // koruns_print(koruns.b, koruns.len, kmer_size, stdout);
  // printf("\nkoruns_ended: ");
  // koruns_print(koruns_ended.b, koruns_ended.len, kmer_size, stdout);
  // printf("\n");

  // Check results match:
  // koruns: chromid:1:17-5:-, chromid:1:37-47:+
  // koruns_ended: chromid:1:34-24:-
  TASSERT2(koruns.len == 2, "koruns.len: %zu", koruns.len);
  TASSERT2(koruns_ended.len == 1, "koruns_ended.len: %zu", koruns_ended.len);
  TASSERT(koruns.b[0].strand == STRAND_MINUS); // reverse complement of ref
  TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom);
  TASSERT2(koruns.b[0].first == 7, "offset: %zu", (size_t)koruns.b[0].first);
  TASSERT2(koruns.b[0].last == 5, "last: %zu", (size_t)koruns.b[0].last);

  korun_buf_dealloc(&koruns);
  korun_buf_dealloc(&koruns_tmp);
  korun_buf_dealloc(&koruns_ended);

  for(i = 0; i < NUM_READS; i++) seq_read_dealloc(&reads[i]);
  kograph_dealloc(&kograph);

  db_graph_dealloc(&graph);
}
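
The expected coordinates in the first block of assertions follow from simple arithmetic: the 13 bp query CCCGACAGGGCAA contains 13 - 11 + 1 = 3 kmers of size 11, and its first kmer starts at offset 5 of the second read, so a single run covering reference kmer offsets 5..7 is expected. A standalone sketch of that calculation (hypothetical helper code, not part of the test framework):

#include <stdio.h>
#include <string.h>

int main(void)
{
  const char *ref = "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA";
  const char *query = "CCCGACAGGGCAA";
  const size_t k = 11;

  const char *hit = strstr(ref, query);
  if(hit == NULL) { fprintf(stderr, "query not found\n"); return 1; }

  size_t first = (size_t)(hit - ref);       // 5: ref offset of first query kmer
  size_t last = first + strlen(query) - k;  // 7: ref offset of last query kmer
  size_t nkmers = strlen(query) - k + 1;    // 3 kmers in one run

  printf("first=%zu last=%zu nkmers=%zu\n", first, last, nkmers);
  return 0;
}
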
Example #3
// Potential bubble - filter ref and duplicate alleles
static void print_bubble(BubbleCaller *caller,
                         GCacheStep **steps, size_t num_paths)
{
  const BubbleCallingPrefs prefs = caller->prefs;
  const dBGraph *db_graph = caller->db_graph;
  GCacheSnode *snode;
  size_t i;

  dBNodeBuffer *flank5p = &caller->flank5p;
  if(flank5p->len == 0)
  {
    // Haven't fetched 5p flank yet
    // flank5p[0] already contains the first node
    flank5p->len = 1;
    supernode_extend(flank5p, prefs.max_flank_len, db_graph);
    db_nodes_reverse_complement(flank5p->b, flank5p->len);
  }

  //
  // Print Bubble
  //

  // write to string buffer then flush to gzFile
  StrBuf *sbuf = &caller->output_buf;
  strbuf_reset(sbuf);

  // Temporary node buffer to use
  dBNodeBuffer *pathbuf = &caller->pathbuf;
  db_node_buf_reset(pathbuf);

  // Get bubble number (threadsafe num_bubbles_ptr++)
  size_t id = __sync_fetch_and_add((volatile size_t*)caller->num_bubbles_ptr, 1);

  // This can be set to anything without a '.' in it
  const char prefix[] = "call";

  // 5p flank
  // strbuf_sprintf(sbuf, ">bubble.%s%zu.5pflank kmers=%zu\n", prefix, id, flank5p->len);
  strbuf_append_str(sbuf, ">bubble.");
  strbuf_append_str(sbuf, prefix);
  strbuf_append_ulong(sbuf, id);
  strbuf_append_str(sbuf, ".5pflank kmers=");
  strbuf_append_ulong(sbuf, flank5p->len);
  strbuf_append_char(sbuf, '\n');
  branch_to_str(flank5p->b, flank5p->len, true, sbuf, db_graph);

  // 3p flank
  db_node_buf_reset(pathbuf);
  snode = graph_cache_snode(&caller->cache, steps[0]->supernode);
  graph_cache_snode_fetch_nodes(&caller->cache, snode, steps[0]->orient, pathbuf);

  // strbuf_sprintf(sbuf, ">bubble.%s%zu.3pflank kmers=%zu\n", prefix, id, pathbuf->len);
  strbuf_append_str(sbuf, ">bubble.");
  strbuf_append_str(sbuf, prefix);
  strbuf_append_ulong(sbuf, id);
  strbuf_append_str(sbuf, ".3pflank kmers=");
  strbuf_append_ulong(sbuf, pathbuf->len);
  strbuf_append_char(sbuf, '\n');
  branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph);

  // Print alleles
  for(i = 0; i < num_paths; i++)
  {
    db_node_buf_reset(pathbuf);
    graph_cache_step_fetch_nodes(&caller->cache, steps[i], pathbuf);

    // strbuf_sprintf(sbuf, ">bubble.%s%zu.branch.%zu kmers=%zu\n",
    //                prefix, id, i, pathbuf->len);
    strbuf_append_str(sbuf, ">bubble.");
    strbuf_append_str(sbuf, prefix);
    strbuf_append_ulong(sbuf, id);
    strbuf_append_str(sbuf, ".branch.");
    strbuf_append_ulong(sbuf, i);
    strbuf_append_str(sbuf, " kmers=");
    strbuf_append_ulong(sbuf, pathbuf->len);
    strbuf_append_char(sbuf, '\n');

    branch_to_str(pathbuf->b, pathbuf->len, false, sbuf, db_graph);
  }

  strbuf_append_char(sbuf, '\n');

  ctx_assert(strlen(sbuf->b) == sbuf->end);

  // lock, print, unlock
  pthread_mutex_lock(caller->out_lock);
  gzwrite(caller->gzout, sbuf->b, sbuf->end);
  pthread_mutex_unlock(caller->out_lock);
}
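
The commented-out strbuf_sprintf() calls show the record headers that the string-append calls build up. A minimal standalone sketch of that header format, with made-up id and kmer counts purely for illustration:

#include <stdio.h>

int main(void)
{
  const char prefix[] = "call";
  size_t id = 0, branch = 0;
  size_t flank5p_kmers = 12, flank3p_kmers = 9, branch_kmers = 15; // hypothetical

  // Same format strings as the commented-out strbuf_sprintf() calls above
  printf(">bubble.%s%zu.5pflank kmers=%zu\n", prefix, id, flank5p_kmers);
  printf(">bubble.%s%zu.3pflank kmers=%zu\n", prefix, id, flank3p_kmers);
  printf(">bubble.%s%zu.branch.%zu kmers=%zu\n",
         prefix, id, branch, branch_kmers);
  return 0;
}
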
Example #4
// Walk the graph remembering the last time we met the ref
// When traversal fails, dump sequence up to last meeting with the ref
static void follow_break(BreakpointCaller *caller, dBNode node)
{
  size_t i, j, k, num_next;
  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t nonref_idx[4], num_nonref_next = 0;
  const dBGraph *db_graph = caller->db_graph;

  BinaryKmer bkey = db_node_get_bkmer(db_graph, node.key);
  Edges edges = db_node_get_edges(db_graph, node.key, 0);

  num_next = db_graph_next_nodes(db_graph, bkey, node.orient, edges,
                                 next_nodes, next_nucs);

  // Filter out next nodes in the reference
  for(i = 0; i < num_next; i++) {
    if(kograph_num(caller->kograph, next_nodes[i].key) == 0) {
      nonref_idx[num_nonref_next] = i;
      num_nonref_next++;
    }
  }

  // Abandon if all options are in ref or none are
  if(num_nonref_next == num_next || num_nonref_next == 0) return;

  // Follow all paths not in ref, in all colours
  GraphCrawler *fw_crawler = &caller->crawlers[node.orient];
  GraphCrawler *rv_crawler = &caller->crawlers[!node.orient];
  dBNodeBuffer *allelebuf = &caller->allelebuf, *flank5pbuf = &caller->flank5pbuf;
  GCMultiColPath *flank5p_multicolpath, *allele_multicolpath;
  KOccurRun *flank5p_runs, *flank3p_runs;
  size_t flank5p_pathid, allele_pathid;
  size_t num_flank5p_runs, num_flank3p_runs;

  // We fetch 5' flanks in all colours, then merge matching paths.
  // We stop fetching a single path once it stops tracking the reference.
  // Alternatively, we could fetch the 5' flank in every colour and stop after
  // a given distance, then check how well that set of paths tracks the
  // reference. That would scale much better with the number of samples, but
  // worse as min_ref_nkmers increases (since we would fetch many flanks that
  // can't be used) - I think this is less of a worry.

  // Loop over possible next nodes at this junction
  for(i = 0; i < num_nonref_next; i++)
  {
    size_t next_idx = nonref_idx[i];

    // Go backwards to get 5p flank
    traverse_5pflank(caller, rv_crawler, db_node_reverse(next_nodes[next_idx]),
                     db_node_reverse(node));

    // Loop over the flanks we got
    for(j = 0; j < rv_crawler->num_paths; j++)
    {
      // Get 5p flank
      db_node_buf_reset(flank5pbuf);
      graph_crawler_get_path_nodes(rv_crawler, j, flank5pbuf);
      flank5p_multicolpath = &rv_crawler->multicol_paths[j];
      flank5p_pathid = flank5p_multicolpath->pathid;

      // Fetch 3pflank ref position
      num_flank5p_runs = caller->flank5p_refs[flank5p_pathid].num_runs;
      flank5p_runs = fetch_ref_contact(&rv_crawler->cache, flank5p_pathid,
                                       caller->flank5p_refs,
                                       &caller->flank5p_run_buf);

      koruns_reverse(flank5p_runs, num_flank5p_runs, flank5pbuf->len);
      koruns_sort_by_qoffset(flank5p_runs, num_flank5p_runs);
      db_nodes_reverse_complement(flank5pbuf->data, flank5pbuf->len);

      if(num_flank5p_runs > 0)
      {
        // Reset caller
        kmer_run_buf_reset(&caller->koruns_3p);
        kmer_run_buf_reset(&caller->koruns_3p_ended);
        kmer_run_buf_reset(&caller->allele_run_buf);

        // functions gcrawler_path_stop_at_ref_covg(),
        //           gcrawler_path_finish_ref_covg()
        // both fill koruns_3p, koruns_3p_ended and allele_run_buf

        // Only traverse in the colours we have a flank for
        graph_crawler_fetch(fw_crawler, node,
                            next_nodes, next_nucs, next_idx, num_next,
                            flank5p_multicolpath->cols,
                            flank5p_multicolpath->num_cols,
                            gcrawler_path_stop_at_ref_covg,
                            gcrawler_path_finish_ref_covg,
                            caller);

        // Assemble contigs - fetch forwards for each path for given 5p flank
        for(k = 0; k < fw_crawler->num_paths; k++)
        {
          // Fetch nodes
          db_node_buf_reset(allelebuf);
          graph_crawler_get_path_nodes(fw_crawler, k, allelebuf);
          ctx_assert(allelebuf->len > 0);

          allele_multicolpath = &fw_crawler->multicol_paths[k];
          allele_pathid = allele_multicolpath->pathid;

          // Fetch 3pflank ref position
          num_flank3p_runs = caller->allele_refs[allele_pathid].num_runs;
          flank3p_runs = fetch_ref_contact(&fw_crawler->cache, allele_pathid,
                                           caller->allele_refs,
                                           &caller->allele_run_buf);

          process_contig(caller,
                         allele_multicolpath->cols,
                         allele_multicolpath->num_cols,
                         flank5pbuf, allelebuf,
                         flank5p_runs, num_flank5p_runs,
                         flank3p_runs, num_flank3p_runs);
        }
      }
    }
  }
}
Example #5
static inline
int test_statement_node(dBNode node, ExpABCWorker *wrkr)
{
  const dBGraph *db_graph = wrkr->db_graph;
  dBNodeBuffer *nbuf = &wrkr->nbuf;
  GraphWalker *wlk = &wrkr->gwlk;
  RepeatWalker *rpt = &wrkr->rptwlk;
  size_t b_idx, col = wrkr->colour;

  // rpt_walker_clear(rpt);

  db_node_buf_reset(nbuf);
  db_node_buf_add(nbuf, node);

  // size_t AB_limit = wrkr->prime_AB ? SIZE_MAX : wrkr->max_AB_dist;
  size_t walk_limit = wrkr->max_AB_dist;
  // status("walk_limit: %zu", walk_limit);

  // Walk from B to find A
  graph_walker_setup(wlk, true, col, col, db_graph);
  graph_walker_start(wlk, nbuf->b[0]);

  while(graph_walker_next(wlk) && nbuf->len < walk_limit) {
    if(!rpt_walker_attempt_traverse(rpt, wlk)) {
      reset(wlk,rpt,nbuf); return RES_LOST_IN_RPT;
    }
    db_node_buf_add(nbuf, wlk->node);
  }

  reset(wlk,rpt,nbuf);

  if(nbuf->len == 1) return RES_NO_TRAVERSAL;

  // Traverse A->B
  db_nodes_reverse_complement(nbuf->b, nbuf->len);
  b_idx = nbuf->len - 1;

  if(wrkr->prime_AB)
  {
    // Prime A->B without attempting to cross
    graph_walker_prime(wlk, nbuf->b, nbuf->len, nbuf->len, true);

    while(graph_walker_next(wlk)) {
      if(!rpt_walker_attempt_traverse(rpt, wlk)) {
        reset(wlk,rpt,nbuf); return RES_LOST_IN_RPT;
      }
      db_node_buf_add(nbuf, wlk->node);
    }
  }
  else
  {
    // Attempt to traverse A->B then extend past B
    int r = confirm_seq(0, true, wlk, rpt, nbuf, col, db_graph);
    switch(r) {
      case CONFIRM_REPEAT: return RES_LOST_IN_RPT;
      case CONFIRM_OVERSHOT: ctx_assert2(0,"Can't 'overshoot' when extending");
      case CONFIRM_WRONG: return RES_AB_WRONG;
      case CONFIRM_SHORT:
        if(wrkr->print_failed_contigs)
          print_failed(node, nbuf, db_graph, true, wrkr->prime_AB);
        wrkr->ab_fail_state[wlk->last_step.status]++;
        return RES_AB_FAILED;
    }
  }

  reset(wlk,rpt,nbuf);

  if(nbuf->len == b_idx+1) return RES_NO_TRAVERSAL; // Couldn't get past B

  // Last node is now C
  // Walk from B... record whether or not we reach C
  ctx_assert(db_nodes_are_equal(nbuf->b[b_idx], db_node_reverse(node)));

  int r = confirm_seq(b_idx, false, wlk, rpt, nbuf, col, db_graph);
  switch(r) {
    case CONFIRM_REPEAT: return RES_LOST_IN_RPT;
    case CONFIRM_OVERSHOT: return RES_BC_OVERSHOT;
    case CONFIRM_WRONG: return RES_BC_WRONG;
    case CONFIRM_SHORT:
      if(wrkr->print_failed_contigs)
        print_failed(node, nbuf, db_graph, false, wrkr->prime_AB);
      wrkr->bc_fail_state[wlk->last_step.status]++;
      return RES_BC_FAILED;
    case CONFIRM_SUCCESS: return RES_ABC_SUCCESS;
  }

  die("Shouldn't reach here: r=%i", r);
  return -1;
}