Beispiel #1
0
// Test helper: run the bubble caller between two flanks and check the result.
//  flank5p: its last kmer_size characters are looked up as the 5p node, which
//           must have more than one outgoing edge
//  flank3p: looked up from its start as the 3p node, which must have more than
//           one incoming edge
//  alleles, num_alleles: forwarded unchanged to _check_alleles()
//  nbuf, sbuf: scratch buffers, only handed on to _check_alleles()
static void _call_bubble(BubbleCaller *caller,
                         const char *flank5p, const char *flank3p,
                         const char **alleles, size_t num_alleles,
                         dBNodeBuffer *nbuf, StrBuf *sbuf)
{
  const dBGraph *db = caller->db_graph;
  const size_t k = db->kmer_size;

  // Final kmer of the 5p flank, first kmer of the 3p flank
  const char *last_kmer5p = flank5p + strlen(flank5p) - k;
  dBNode fork5p = db_graph_find_str(db, last_kmer5p);
  dBNode join3p = db_graph_find_str(db, flank3p);
  TASSERT(fork5p.key != HASH_NOT_FOUND);
  TASSERT(join3p.key != HASH_NOT_FOUND);

  // The 5p node has to branch out and the 3p node has to merge branches
  Edges out_edges = db_node_get_edges_union(db, fork5p.key);
  Edges in_edges = db_node_get_edges_union(db, join3p.key);
  TASSERT(edges_get_outdegree(out_edges, fork5p.orient) > 1);
  TASSERT(edges_get_indegree(in_edges, join3p.orient) > 1);

  find_bubbles(caller, fork5p);

  // Locate the cached unitig holding the 3p node and its orientation there
  GCacheUnitig *unitig3p = graph_cache_find_unitig(&caller->cache, join3p);
  TASSERT(unitig3p != NULL);
  Orientation orient3p = gc_unitig_get_orient(&caller->cache, unitig3p, join3p);

  find_bubbles_ending_with(caller, unitig3p);

  // Pick the step buffer that matches the 3p orientation
  GCacheStepPtrBuf *steps;
  if(orient3p == FORWARD)
    steps = &caller->spp_forward;
  else
    steps = &caller->spp_reverse;

  _check_alleles(&caller->cache, steps, alleles, num_alleles, nbuf, sbuf);
}
Beispiel #2
0
// Returns 1 if read `idx` is a substring of ANY read in the list or a complete
// match with a read before it in the list. Returns <= 0 otherwise.
// Comparison is case-insensitive and also considers the reverse complement.
//  1 => is substr
//  0 => not substr
// -1 => not enough bases of ACGT
//
//  rbuf:     buffer of reads; `idx` indexes the read under test
//  kograph:  kmer occurrence graph used to find other reads sharing the
//            first kmer of read `idx`
//  db_graph: graph used to look up that first kmer
static int _is_substr(const ReadBuffer *rbuf, size_t idx,
                      const KOGraph *kograph, const dBGraph *db_graph)
{
  const size_t kmer_size = db_graph->kmer_size;
  const read_t *r = &rbuf->b[idx], *r2;
  size_t contig_start;

  // Offset of the first kmer in the read that can be looked up
  contig_start = seq_contig_start(r, 0, kmer_size, 0, 0);
  if(contig_start >= r->seq.end) return -1; // No kmers in this sequence

  dBNode node = db_graph_find_str(db_graph, r->seq.b+contig_start);
  ctx_assert(node.key != HASH_NOT_FOUND);

  // expect at least one hit (for this read!)
  ctx_assert(kograph_occurs(kograph, node.key));
  KOccur *hit;

  // Occurrences of a kmer are stored consecutively; `next` is unset on the last
  for(hit = kograph_get(kograph, node.key); 1; hit++)
  {
    if(hit->chrom != idx)
    {
      r2 = &rbuf->b[hit->chrom];

      // A read is a duplicate (i.e. return 1) if it is a substring of ANY
      // read in the list or a complete match with a read before it in the list.
      // That is why we have: (hit->chrom < idx || r->seq.end < r2->seq.end)
      // since identical strings have equal length
      if(hit->chrom < idx || r->seq.end < r2->seq.end) {
        if(hit->orient == node.orient) {
          // potential FORWARD match
          // The kmer sits at r[contig_start] and r2[hit->offset], so r would
          // occupy r2[r2_start .. r2_start + r->seq.end)
          if(hit->offset >= contig_start) {
            size_t r2_start = hit->offset - contig_start;

            if(r2_start + r->seq.end <= r2->seq.end &&
               strncasecmp(r->seq.b, r2->seq.b+r2_start, r->seq.end) == 0)
            {
              return 1;
            }
          }
        }
        else {
          // potential REVERSE match
          // if read is '<NNNN>[kmer]<rem>' rX_rem is the number of chars after
          // the first valid kmer
          size_t r1_rem =  r->seq.end - (contig_start   + kmer_size);
          size_t r2_rem = r2->seq.end - (hit->offset + kmer_size);

          // Reverse complemented, the bases after the kmer in r lie before it
          // in r2 and the bases before the kmer in r lie after it in r2
          if(r1_rem <= hit->offset && r2_rem >= contig_start &&
             dna_revncasecmp(r->seq.b, r2->seq.b+hit->offset-r1_rem, r->seq.end) == 0)
          {
            return 1;
          }
        }
      }
    }

    if(!hit->next) break;
  }

  return 0;
}
Beispiel #3
0
// Test helper: check that the paths stored against `kmer` are exactly the
// expected ones.
//  kmer:      string of length graph->kmer_size naming the node to inspect
//  path_strs: expected paths, each given as the sequence of the nodes it covers
//  npaths:    number of entries in path_strs
//  colour:    only paths that have this colour are considered
//  graph:     graph (and path store) to query
// Only paths with the same orientation as the node are considered. Each such
// path must match exactly one entry of path_strs and every entry must be
// matched by a path.
static void _check_node_paths(const char *kmer,
                              const char **path_strs, size_t npaths,
                              size_t colour, const dBGraph *graph)
{
  TASSERT(strlen(kmer) == graph->kmer_size);

  const GPath *paths[npaths]; // corresponding to path_strs
  memset(paths, 0, sizeof(paths));
  size_t i, num_paths_seen = 0;

  const GPathStore *gpstore = &graph->gpstore;
  dBNode node = db_graph_find_str(graph, kmer);

  const GPath *path = gpath_store_fetch_traverse(gpstore, node.key);
  dBNodeBuffer nbuf;
  SizeBuffer jposbuf;
  db_node_buf_alloc(&nbuf, 64);
  size_buf_alloc(&jposbuf, 64);

  #define MAX_SEQ 128
  char seq[MAX_SEQ];

  // Walk the linked list of paths attached to this node
  for(; path != NULL; path = path->next)
  {
    if(path->orient == node.orient &&
       gpath_has_colour(path, gpstore->gpset.ncols, colour))
    {
      TASSERT(num_paths_seen < npaths);
      db_node_buf_reset(&nbuf);
      gpath_fetch(node, path, &nbuf, &jposbuf, colour, graph);
      // The string is kmer_size+nbuf.len-1 characters plus a terminating NUL,
      // so kmer_size+nbuf.len bytes must fit in seq[]
      if(graph->kmer_size + nbuf.len > MAX_SEQ)
        die("Too many nodes. Cannot continue. %zu", nbuf.len);
      db_nodes_to_str(nbuf.b, nbuf.len, graph, seq);
      TASSERT(strlen(seq) == graph->kmer_size + nbuf.len - 1);
      // Match this path against the expected strings
      for(i = 0; i < npaths; i++) {
        if(strcmp(path_strs[i],seq) == 0) {
          TASSERT2(paths[i] == NULL, "Duplicate paths: %s", seq);
          paths[i] = path;
          break;
        }
      }
      TASSERT2(i < npaths, "Path not found: %s", seq);
      num_paths_seen++;
    }
  }

  TASSERT(num_paths_seen == npaths);

  // Every expected path must have been seen
  for(i = 0; i < npaths; i++) {
    TASSERT2(paths[i] != NULL, "path not in graph: %s", path_strs[i]);
  }

  db_node_buf_dealloc(&nbuf);
  size_buf_dealloc(&jposbuf);
}
Beispiel #4
0
// Test helper: check that supernodes can be pulled out of the graph.
//  seq:   n input sequences whose kmers are looked up in the graph
//  ans:   n expected supernode strings; ans[i] is expected for every kmer
//         of seq[i]
//  n:     number of entries in seq and ans
//  graph: graph to query
// Supernodes are fetched twice: once by iterating over the hash table and
// once by iterating over every kmer of the input sequences.
static void pull_out_supernodes(const char **seq, const char **ans, size_t n,
                                const dBGraph *graph)
{
  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 1024);

  // 1. Check pulling out supernodes works for iterating over the graph
  uint64_t *visited;
  visited = ctx_calloc(roundup_bits2words64(graph->ht.capacity),
                       sizeof(*visited));
  HASH_ITERATE(&graph->ht, supernode_from_kmer,
               &nbuf, visited, graph, ans, n);
  ctx_free(visited);

  // 2. Check pulling out supernodes works when we iterate over inputs
  size_t i, j, len, nbytes;
  dBNode node;
  char tmpstr[SNODEBUF];

  for(i = 0; i < n; i++) {
    len = strlen(seq[i]);
    for(j = 0; j+graph->kmer_size <= len; j++)
    {
      // Find node
      node = db_graph_find_str(graph, seq[i]+j);
      TASSERT(node.key != HASH_NOT_FOUND);

      // Fetch supernode
      db_node_buf_reset(&nbuf);
      supernode_find(node.key, &nbuf, graph);
      supernode_normalise(nbuf.b, nbuf.len, graph);

      // The string is kmer_size+nbuf.len-1 characters plus a terminating NUL.
      // Skip the conversion rather than write past the end of tmpstr.
      nbytes = graph->kmer_size + nbuf.len;
      TASSERT(nbytes <= SNODEBUF);
      if(nbytes > SNODEBUF) continue;

      // Compare
      db_nodes_to_str(nbuf.b, nbuf.len, graph, tmpstr);
      if(strcmp(tmpstr, ans[i]) != 0) {
        test_status("Got: %s from ans[i]:%s\n", tmpstr, ans[i]);
      }
      TASSERT(strcmp(tmpstr, ans[i]) == 0);
    }
  }

  db_node_buf_dealloc(&nbuf);
}
void test_graph_crawler()
{
  test_status("Testing graph crawler...");

  // Construct 1 colour graph with kmer-size=11
  dBGraph graph;
  const size_t kmer_size = 11, ncols = 3;

  db_graph_alloc(&graph, kmer_size, ncols, 1, 2048,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  char graphseq[3][77] =
//           <               X                 X              X...............
{"GTTCCAGAGCGGAGGTCTCCCAACAACATGGTATAAGTTGTCTAGCCCCGGTTCGCGCGGGTACTTCTTACAGCGC",
 "GTTCCAGAGCGGAGGTCTCCCAACAACTTGGTATAAGTTGTCTAGTCCCGGTTCGCGCGGCATTTCAGCATTGTTA",
 "GTTCCAGAGCGCGACAGAGTGCATATCACGCTAAGCACAGCCCTCTTCTATCTGCTTTTAAATGGATCAATAATCG"};

  build_graph_from_str_mt(&graph, 0, graphseq[0], strlen(graphseq[0]));
  build_graph_from_str_mt(&graph, 1, graphseq[1], strlen(graphseq[1]));
  build_graph_from_str_mt(&graph, 2, graphseq[2], strlen(graphseq[2]));

  // Crawl graph
  GraphCrawler crawler;
  graph_crawler_alloc(&crawler, &graph);

  dBNode node = db_graph_find_str(&graph, graphseq[0]);
  dBNode next_node = db_graph_find_str(&graph, graphseq[0]+1);
  TASSERT(node.key != HASH_NOT_FOUND);
  TASSERT(next_node.key != HASH_NOT_FOUND);

  BinaryKmer bkey = db_node_get_bkmer(&graph, node.key);
  Edges edges = db_node_get_edges(&graph, node.key, 0);

  dBNode next_nodes[4];
  Nucleotide next_nucs[4];
  size_t i, p, num_next, next_idx;

  num_next = db_graph_next_nodes(&graph, bkey, node.orient, edges,
                                 next_nodes, next_nucs);

  next_idx = 0;
  while(next_idx < num_next && !db_nodes_are_equal(next_nodes[next_idx],next_node))
    next_idx++;

  TASSERT(next_idx < num_next && db_nodes_are_equal(next_nodes[next_idx],next_node));

  // Crawl in all colours
  graph_crawler_fetch(&crawler, node, next_nodes, next_idx, num_next,
                      NULL, graph.num_of_cols, NULL, NULL, NULL);

  TASSERT2(crawler.num_paths == 2, "crawler.num_paths: %u", crawler.num_paths);

  // Fetch paths
  dBNodeBuffer nbuf;
  db_node_buf_alloc(&nbuf, 16);
  StrBuf sbuf;
  strbuf_alloc(&sbuf, 128);

  for(p = 0; p < crawler.num_paths; p++) {
    db_node_buf_reset(&nbuf);
    graph_crawler_get_path_nodes(&crawler, p, &nbuf);
    strbuf_ensure_capacity(&sbuf, nbuf.len+graph.kmer_size);
    sbuf.end = db_nodes_to_str(nbuf.b, nbuf.len, &graph, sbuf.b);
    for(i = 0; i < 3 && strcmp(graphseq[i]+1,sbuf.b) != 0; i++) {}
    TASSERT2(i < 3, "seq: %s", sbuf.b);
    TASSERT2(sbuf.end == 75, "sbuf.end: %zu", sbuf.end);
    TASSERT2(nbuf.len == 65, "nbuf.len: %zu", nbuf.len);
  }

  strbuf_dealloc(&sbuf);
  db_node_buf_dealloc(&nbuf);

  graph_crawler_dealloc(&crawler);

  db_graph_dealloc(&graph);
}
// Walk through a sequence containing a tandem repeat three times: with no
// paths loaded, with two short paths covering the repeat's entry and exit,
// and with one path covering the whole sequence. Each walk is compared
// against the expected node count and sequence by test_walk().
static void test_repeat_loop()
{
  TASSERT(sizeof(FollowPath) == 20);

  // Single colour graph, kmer-size=11
  size_t kmer_size = 11, ncols = 1;
  dBGraph graph;

  // Alignment correction parameters used when generating paths
  CorrectAlnParam params = {.ctpcol = 0, .ctxcol = 0,
                            .ins_gap_min = 0, .ins_gap_max = 0,
                            .one_way_gap_traverse = true, .use_end_check = true,
                            .max_context = 10,
                            .gap_variance = 0.1, .gap_wiggle = 5};

  // 15 unique bases, eleven copies of a 12 base repeat, 15 unique bases
  char seq[] = "ATTTGGAACTCCGGA"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "GATAGGGCCAGT"
               "CGTCAGGAGCTAACT";

  // Short reads: into the repeat (two copies) and out of it (two copies)
  char p0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT";
  char p1[] = "GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT";

  // Expected walks for stages 1 and 2
  char ans0[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT";
  char ans1[] = "ATTTGGAACTCCGGA""GATAGGGCCAGT""GATAGGGCCAGT""CGTCAGGAGCTAACT";

  // Allocate graph, but don't add any sequence
  _construct_graph_with_paths(&graph, kmer_size, ncols, NULL, 0, params);

  GenPathWorker *wrkr = gen_paths_workers_alloc(1, &graph, NULL);

  GraphWalker walker;
  RepeatWalker rpt;
  dBNodeBuffer nbuf;
  graph_walker_alloc(&walker);
  rpt_walker_alloc(&rpt, graph.ht.capacity, 12);
  db_node_buf_alloc(&nbuf, 1024);

  // Load the sequence into the graph; no paths yet
  build_graph_from_str_mt(&graph, 0, seq, strlen(seq));
  TASSERT2(graph.ht.num_kmers == 15+12+15, "%zu", (size_t)graph.ht.num_kmers);

  // Every walk starts from the first kmer of the sequence
  dBNode start = db_graph_find_str(&graph, seq);
  TASSERT(start.key != HASH_NOT_FOUND);

  // 1) With no paths
  test_walk(&walker, &rpt, start, &nbuf, &graph, 15+2, ans0);

  // 2) Add small paths - produces collapsed down seq with two copy repeat
  gen_paths_from_str_mt(wrkr, p0, params);
  gen_paths_from_str_mt(wrkr, p1, params);
  test_walk(&walker, &rpt, start, &nbuf, &graph, 15+12+12+5, ans1);

  // 3) Add long paths
  gen_paths_from_str_mt(wrkr, seq, params);
  test_walk(&walker, &rpt, start, &nbuf, &graph, strlen(seq)+1-kmer_size, seq);

  // Clean up
  graph_walker_dealloc(&walker);
  rpt_walker_dealloc(&rpt);
  db_node_buf_dealloc(&nbuf);
  gen_paths_workers_dealloc(wrkr, 1);
  db_graph_dealloc(&graph);
}

// Entry point for the repeat walker tests: prints a status line, then runs
// test_repeat_loop().
void test_repeat_walker()
{
  test_status("Testing repeat_walker.h");
  test_repeat_loop();
}
Beispiel #7
0
// Test kograph_filter_extend(): build a kmer occurrence graph from three
// reads, then check the runs reported for three consecutive kmers, first in
// the orientation they were looked up in and then reverse complemented.
static void test_kmer_occur_filter()
{
  // Graph with kmer-size=11 and ncols=3 colours
  const size_t kmer_size = 11, ncols = 3;
  dBGraph graph;
  size_t i;

  db_graph_alloc(&graph, kmer_size, ncols, 1, 2000,
                 DBG_ALLOC_EDGES | DBG_ALLOC_NODE_IN_COL | DBG_ALLOC_BKTLOCKS);

  #define NUM_NODES 3
  #define NUM_READS 3

  // The assertions below expect all hits to land on read 1:
  //      xyz------->>>      y         >  <         X
  // TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA
  const char *read_strs[NUM_READS] = {
    "AACA",
    "TTCGACCCGACAGGGCAACGTAGTCCGACAGGGCACAGCCCTGTCGGGGGGTGCA",
    "TCTAGCATGTGTGTT"
  };

  read_t reads[NUM_READS];
  for(i = 0; i < NUM_READS; i++) {
    seq_read_alloc(&reads[i]);
    seq_read_set(&reads[i], read_strs[i]);
  }

  KOGraph kograph = kograph_create(reads, NUM_READS, true, 0, 1, &graph);

  TASSERT(kograph.nchroms == NUM_READS);
  TASSERT(kograph.koccurs != NULL);

  // Run buffers: current runs, scratch space and runs that have ended
  KOccurRunBuffer koruns, koruns_tmp, koruns_ended;
  korun_buf_alloc(&koruns, 16);
  korun_buf_alloc(&koruns_tmp, 16);
  korun_buf_alloc(&koruns_ended, 16);

  // Look up the three kmers of CCCGACAGGGCAA:
  // x=CCCGACAGGGC, y=CCGACAGGGCA, z=CGACAGGGCAA
  // X=GCCCTGTCGGG, Y=TGCCCTGTCGG, Z=TTGCCCTGTCG
  const char *query = "CCCGACAGGGCAA";
  dBNode nodes[NUM_NODES];
  for(i = 0; i < NUM_NODES; i++) {
    nodes[i] = db_graph_find_str(&graph, query+i);
  }

  // Forward: pass all three nodes in a single call
  korun_buf_reset(&koruns);
  korun_buf_reset(&koruns_ended);
  kograph_filter_extend(&kograph, nodes, NUM_NODES, true, 0, 0,
                        &koruns, &koruns_tmp, &koruns_ended);

  // Expect a single run on read 1 spanning offsets 5 to 7
  TASSERT2(koruns.len == 1, "koruns.len: %zu", koruns.len);
  TASSERT(koruns.b[0].strand == STRAND_PLUS); // left-to-right with ref
  TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom);
  TASSERT2(koruns.b[0].first == 5, "offset: %zu", (size_t)koruns.b[0].first);
  TASSERT2(koruns.b[0].last == 7, "last: %zu", (size_t)koruns.b[0].last);

  // Reverse: flip the nodes and extend one node per call
  db_nodes_reverse_complement(nodes, NUM_NODES);

  korun_buf_reset(&koruns);
  korun_buf_reset(&koruns_ended);
  for(i = 0; i < NUM_NODES; i++) {
    kograph_filter_extend(&kograph, nodes+i, 1, true, 0, i,
                          &koruns, &koruns_tmp, &koruns_ended);
  }

  // When debugging, the run lists can be dumped with
  // koruns_print(koruns.b, koruns.len, kmer_size, stdout) and
  // koruns_print(koruns_ended.b, koruns_ended.len, kmer_size, stdout)

  // Check results match:
  // koruns: chromid:1:17-5:-, chromid:1:37-47:+
  // koruns_ended: chromid:1:34-24:-
  TASSERT2(koruns.len == 2, "koruns.len: %zu", koruns.len);
  TASSERT2(koruns_ended.len == 1, "koruns_ended.len: %zu", koruns_ended.len);
  TASSERT(koruns.b[0].strand == STRAND_MINUS); // reverse complement of ref
  TASSERT2(koruns.b[0].chrom == 1, "chrom: %zu", (size_t)koruns.b[0].chrom);
  TASSERT2(koruns.b[0].first == 7, "offset: %zu", (size_t)koruns.b[0].first);
  TASSERT2(koruns.b[0].last == 5, "last: %zu", (size_t)koruns.b[0].last);

  // Release buffers, reads, occurrence graph and graph
  korun_buf_dealloc(&koruns);
  korun_buf_dealloc(&koruns_tmp);
  korun_buf_dealloc(&koruns_ended);

  for(i = 0; i < NUM_READS; i++) {
    seq_read_dealloc(&reads[i]);
  }
  kograph_dealloc(&kograph);

  db_graph_dealloc(&graph);
}